• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Symbol-definition helpers (expanded by the C preprocessor).
 *
 *   ENTRY(f)   -- switch to .text, align, export f, mark it as a function and
 *                 place its label.
 *   PRIVATE(f) -- same as ENTRY(f) but without the .globl, so f is not
 *                 exported.
 *   END(f)     -- record the size of f as the distance from its label to the
 *                 current location.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define PRIVATE(f) .text; .align 4; .type f,#function; f:
#define END(f) .size f, .-f;

/* FRACTION_BITS feeds the shift amounts of the final narrowing steps
 * (#16 - FRACTION_BITS and #FRACTION_BITS); MAX_R is the default max_r of the
 * fetch macro.
 */
.set FRACTION_BITS, 7
.set MAX_R, 25


/* A quick way of making a line of code conditional on some other condition.
 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
 * `ifcc`:
 *
 * zzz is the complete statement (mnemonic and operands) to emit.  The symbol
 * cc is evaluated when the macro is expanded, so it must have been set before
 * the prefixed line is reached.
 */
.macro ifcc zzz:vararg
.if cc
            \zzz
.endif
.endm

/* Fetch 16 columns of bytes (regardless of image format), convolve these
 * vertically, and leave them in the register file.  If working near the top or
 * bottom of an image then clamp the addressing while loading the data in.
 *
 * The convolution is fully unrolled for windows up to max_r, with the
 * outermost edges calculated first.  This way it's possible to branch directly
 * into the relevant part of the code for an arbitrary convolution radius.  Two
 * variants of the loop are produced; one eliminates the clamping code for a
 * slight speed advantage.
 *
 * Where the macro is called with reg=x, the specified register is taken to
 * contain a pre-calculated pointer into one of the two loops.  With the
 * default reg=x12 the entry point into the clamping loop is computed here
 * from r (x5).
 *
 * Macro arguments:
 *      max_r   -- largest radius to unroll for (default MAX_R)
 *      labelc  -- label placed at the end of the clamping loop
 *      labelnc -- label placed at the end of the non-clamping loop
 *      reg     -- register holding (or receiving) the entry point
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x5 -- r
 *      x6 -- rup
 *      x7 -- rdn
 *      x12 -- switch index
 *      q0-q3 -- coefficient table
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x1 += 16
 *      x15 += 16
 *      x19 += 16
 *      q10,q11 -- 16 convolved columns
 * Modifies:
 *      x10 = upper row pointer
 *      x11 = lower row pointer
 *      q12-q15 = temporary sums
 *      q16 = temporary sum of an upper/lower row pair
 *      condition flags (clamping loop only)
 */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
  // cc = 1 when the entry point has to be computed here (reg is x12),
  // cc = 0 when the caller already supplies it in reg.
  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            ld1         {v15.16b}, [x1], #16
            mov         x10, x15

            uxtl        v14.8h, v15.8b
//            prfm        PLDL1KEEP,[x1, #16] // TODO: confirm
            uxtl2       v15.8h, v15.16b
  .if \max_r < 16 // approximate
    ifcc    adr         \reg, 1f
  .else
    ifcc    adrp        \reg, 1f
    ifcc    add         \reg, \reg, #:lo12:1f
  .endif

  // Each tap of the clamping loop below is 14 instructions (56 bytes), so
  // the entry point for radius r is label 1 minus r*56, formed as
  // 1f - (r << 6) + (r << 3).  The centre-tap multiplies are interleaved
  // with that address arithmetic.
            umull       v12.4s, v14.4h, v0.h[0]
    ifcc    sub         \reg, \reg, x5, LSL #6
            umull2      v13.4s, v14.8h, v0.h[0]
            mov         x11, x19
            umull       v14.4s, v15.4h, v0.h[0]
    ifcc    add         \reg, \reg, x5, LSL #3
            umull2      v15.4s, v15.8h, v0.h[0]
            br          \reg

  // First pass (rowclamp = 1) emits the clamping loop, second pass the
  // non-clamping loop.  Taps are emitted from i = max_r down to i = 1, and
  // tap i uses coefficient lane i of the table (register i / 8, lane i % 8).
  .irp rowclamp, 1, 0
    .set cc, \rowclamp
    .align 4
    .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
        .set i, \dreg * 8 + \lane
        .if 0 < i && i <= \max_r
            ld1         {v10.16b}, [x10], x2
    ifcc    cmp         x6, #i
            ld1         {v11.16b}, [x11], x13
    ifcc    csel        x10, x15, x10, lo       // rup < i: stay on the top row
            uaddl       v16.8h, v10.8b, v11.8b
    ifcc    cmp         x7, #i
            uaddl2      v11.8h, v10.16b, v11.16b
    ifcc    csel        x11, x19, x11, lo       // rdn < i: stay on the bottom row
            umlal       v12.4s, v16.4h, v\dreg\doth[\lane]
            umlal2      v13.4s, v16.8h, v\dreg\doth[\lane]
//            prfm        PLDL1KEEP,[x10, #32] // TODO: confirm
            nop                                 // counted in the per-tap size
            umlal       v14.4s, v11.4h, v\dreg\doth[\lane]
//            prfm        PLDL1KEEP,[x11, #32] // TODO: confirm
            nop                                 // counted in the per-tap size
            umlal2      v15.4s, v11.8h, v\dreg\doth[\lane]
        .endif
    .endr ; .endr ; .endr
    .if \rowclamp == 1
        1: \labelc :
            b           2f                      // skip over the non-clamping loop
    .else
        2: \labelnc :
    .endif
  .endr

  // Narrow the four 32-bit accumulators to 16 bits with rounding and
  // saturation, and step the clamp row pointers on to the next 16 columns.
            uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
            add         x15, x15, #16
            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
            add         x19, x19, #16
            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/

/* Some portion of the convolution window (as much as will fit, and all of it
 * for the uchar1 cases) is kept in the register file to avoid unnecessary
 * memory accesses.  This forces the horizontal loops to be unrolled because
 * there's no indexed addressing into the register file.
 *
 * As in the fetch macro, the operations are ordered from outside to inside, so
 * that jumping into the middle of the block bypasses the unwanted window taps.
 *
 * There are several variants of the macro because of the fixed offsets of the
 * taps -- the wider the maximum radius the further the centre tap is from the
 * most recently fetched data.  This means that pre-filling the window requires
 * more data that won't be used and it means that rotating the window involves
 * more mov operations.
 *
 * When the buffer gets too big the buffer at [x9] is used.
 *
 * Input:
 *      q4-q11 -- convolution window
 *      x5 -- r (index into the jump table)
 *      x9 -- pointer to additional convolution window data
 * Output:
 *      x9 -- updated buffer pointer (if used)
 *      v15.8b (low half of q15) -- result to be stored
 * Modifies:
 *      x12 -- temp buffer pointer / computed branch target
 *      x16 -- jump table base
 *      q12-q13 -- temporaries for load and vext operations.
 *      q14-q15 -- intermediate sums
 */
#define TUNED_LIST1 8, 16
/* hconv1_8: horizontal pass with taps one 16-bit lane apart, radius up to 8.
 * The centre tap is v9 and the window is v8..v10; v11 is rotated in at the
 * end.  x5 selects an entry of the table at 100, which holds 16-bit offsets
 * relative to 100; entry r is the offset of label 100+r, so execution enters
 * at the outermost tap for radius r and falls through to tap 1.  Entry 0 (-4)
 * is the offset of the br itself -- presumably r == 0 never gets here (TODO
 * confirm).
 */
.macro hconv1_8/*{{{*/
            umull       v14.4s, v9.4h, v0.h[0]
            umull2      v15.4s, v9.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .align      4
    // Tap 8 is a whole register either side of the centre, so no ext is needed.
    108:    umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
            umlal       v14.4s, v10.4h, v1.h[0]
            umlal2      v15.4s, v10.8h, v1.h[0]
    107:    ext         v12.16b, v8.16b, v9.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v8.16b, v9.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v8.16b, v9.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    // Tap 4 is a half-register shift, so the ext pair is replaced by reading
    // the high half of one register and the low half of the next directly.
    104:    //ext         v12.16b, v8.16b, v9.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
            umlal2      v14.4s, v9.8h, v0.h[4]
            umlal       v15.4s, v10.4h, v0.h[4]
    103:    ext         v12.16b, v8.16b, v9.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v8.16b, v9.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v8.16b, v9.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            // Narrow 32 -> 16 -> 8 bits with rounding and saturation.
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            // Rotate the window along by one register.
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* hconv1_16: horizontal pass with taps one 16-bit lane apart, radius up to 16.
 * The centre tap is v8 and the window is v6..v10; v11 is rotated in at the
 * end.  x5 (r) selects an entry of the table at 100, which holds 16-bit
 * offsets relative to 100; entry r is the offset of label 100+r, so execution
 * enters at the outermost tap for radius r and falls through to tap 1.
 * Entry 0 (-4) is the offset of the br itself -- presumably r == 0 never gets
 * here (TODO confirm).
 *
 * Output:   v15.8b -- eight results
 * Modifies: x12, x16, v12-v15, and v6-v10 (window rotation)
 */
.macro hconv1_16/*{{{*/
            umull       v14.4s, v8.4h, v0.h[0]
            umull2      v15.4s, v8.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .align 4
    // Taps that are a whole number of registers from the centre (16, 8) read
    // the registers directly, and half-register taps (12, 4) read a high half
    // and the following low half, so their ext pairs are commented out.
    116:    //ext         v12.16b, v6.16b, v7.16b, #0*2
            //ext         v13.16b, v10.16b, v11.16b, #0*2
            umlal       v14.4s, v6.4h, v2.h[0]
            umlal2      v15.4s, v6.8h, v2.h[0]
            umlal       v14.4s, v10.4h, v2.h[0]
            umlal2      v15.4s, v10.8h, v2.h[0]
    115:    ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
    114:    ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
    113:    ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
    112:    //ext         v12.16b, v6.16b, v7.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal2      v14.4s, v6.8h, v1.h[4]
            umlal       v15.4s, v7.4h, v1.h[4]
            umlal2      v14.4s, v9.8h, v1.h[4]
            umlal       v15.4s, v10.4h, v1.h[4]
    111:    ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
    110:    ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
    109:    ext         v12.16b, v6.16b, v7.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
    108:    //ext         v12.16b, v7.16b, v8.16b, #0*2
            //ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v7.4h, v1.h[0]
            umlal2      v15.4s, v7.8h, v1.h[0]
            umlal       v14.4s, v9.4h, v1.h[0]
            umlal2      v15.4s, v9.8h, v1.h[0]
    107:    ext         v12.16b, v7.16b, v8.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v7.16b, v8.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v7.16b, v8.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    //ext         v12.16b, v7.16b, v8.16b, #4*2
            //ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal2      v14.4s, v7.8h, v0.h[4]
            umlal       v15.4s, v8.4h, v0.h[4]
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
    103:    ext         v12.16b, v7.16b, v8.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v7.16b, v8.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v7.16b, v8.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            // Narrow 32 -> 16 -> 8 bits with rounding and saturation.
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            // Rotate the window along by one register.
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* hconv1_25: horizontal pass with taps one 16-bit lane apart, radius up to 25.
 * The centre tap starts at lane 7 of v6 (ext of v6:v7 by 7 lanes) and the
 * window is v31, v4..v10; v11 is rotated in at the end.  Note that v31 is the
 * register that precedes v4 in the window -- v0-v3 hold the coefficients.
 * x5 (r) selects an entry of the table at 100, which holds 16-bit offsets
 * relative to 100; entry r is the offset of label 100+r, so execution enters
 * at the outermost tap for radius r and falls through to tap 1.  Entry 0 (-4)
 * is the offset of the br itself -- presumably r == 0 never gets here (TODO
 * confirm).
 *
 * Output:   v15.8b -- eight results
 * Modifies: x12, x16, v12-v15, and v31, v4-v10 (window rotation)
 */
.macro hconv1_25/*{{{*/
            ext         v12.16b, v6.16b, v7.16b, #7*2
            umull       v14.4s, v12.4h, v0.h[0]
            umull2      v15.4s, v12.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .hword 117f-100b
            .hword 118f-100b
            .hword 119f-100b
            .hword 120f-100b
            .hword 121f-100b
            .hword 122f-100b
            .hword 123f-100b
            .hword 124f-100b
            .hword 125f-100b
            .align 4
    125:    ext         v12.16b, v31.16b, v4.16b, #6*2
            ext         v13.16b, v10.16b, v11.16b, #0*2
            umlal       v14.4s, v12.4h, v3.h[1]
            umlal2      v15.4s, v12.8h, v3.h[1]
            umlal       v14.4s, v13.4h, v3.h[1]
            umlal2      v15.4s, v13.8h, v3.h[1]
    // The left operand must come from v31:v4 (window data), one lane on from
    // tap 125.  v3 is the coefficient register and must not be used as data.
    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v3.h[0]
            umlal2      v15.4s, v12.8h, v3.h[0]
            umlal       v14.4s, v13.4h, v3.h[0]
            umlal2      v15.4s, v13.8h, v3.h[0]
    123:    ext         v12.16b, v4.16b, v5.16b, #0*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v2.h[7]
            umlal2      v15.4s, v12.8h, v2.h[7]
            umlal       v14.4s, v13.4h, v2.h[7]
            umlal2      v15.4s, v13.8h, v2.h[7]
    122:    ext         v12.16b, v4.16b, v5.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v2.h[6]
            umlal2      v15.4s, v12.8h, v2.h[6]
            umlal       v14.4s, v13.4h, v2.h[6]
            umlal2      v15.4s, v13.8h, v2.h[6]
    121:    ext         v12.16b, v4.16b, v5.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal       v14.4s, v12.4h, v2.h[5]
            umlal2      v15.4s, v12.8h, v2.h[5]
            umlal       v14.4s, v13.4h, v2.h[5]
            umlal2      v15.4s, v13.8h, v2.h[5]
    120:    ext         v12.16b, v4.16b, v5.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v2.h[4]
            umlal2      v15.4s, v12.8h, v2.h[4]
            umlal       v14.4s, v13.4h, v2.h[4]
            umlal2      v15.4s, v13.8h, v2.h[4]
    119:    ext         v12.16b, v4.16b, v5.16b, #4*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v2.h[3]
            umlal2      v15.4s, v12.8h, v2.h[3]
            umlal       v14.4s, v13.4h, v2.h[3]
            umlal2      v15.4s, v13.8h, v2.h[3]
    118:    ext         v12.16b, v4.16b, v5.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v2.h[2]
            umlal2      v15.4s, v12.8h, v2.h[2]
            umlal       v14.4s, v13.4h, v2.h[2]
            umlal2      v15.4s, v13.8h, v2.h[2]
    117:    ext         v12.16b, v4.16b, v5.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v12.4h, v2.h[1]
            umlal2      v15.4s, v12.8h, v2.h[1]
            umlal       v14.4s, v13.4h, v2.h[1]
            umlal2      v15.4s, v13.8h, v2.h[1]
    116:    ext         v12.16b, v4.16b, v5.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v2.h[0]
            umlal2      v15.4s, v12.8h, v2.h[0]
            umlal       v14.4s, v13.4h, v2.h[0]
            umlal2      v15.4s, v13.8h, v2.h[0]
    115:    ext         v12.16b, v5.16b, v6.16b, #0*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
    114:    ext         v12.16b, v5.16b, v6.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
    113:    ext         v12.16b, v5.16b, v6.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
    112:    ext         v12.16b, v5.16b, v6.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[4]
            umlal2      v15.4s, v12.8h, v1.h[4]
            umlal       v14.4s, v13.4h, v1.h[4]
            umlal2      v15.4s, v13.8h, v1.h[4]
    111:    ext         v12.16b, v5.16b, v6.16b, #4*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
    110:    ext         v12.16b, v5.16b, v6.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
    109:    ext         v12.16b, v5.16b, v6.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #0*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
    108:    ext         v12.16b, v5.16b, v6.16b, #7*2
            ext         v13.16b, v7.16b, v8.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[0]
            umlal2      v15.4s, v12.8h, v1.h[0]
            umlal       v14.4s, v13.4h, v1.h[0]
            umlal2      v15.4s, v13.8h, v1.h[0]
    107:    ext         v12.16b, v6.16b, v7.16b, #0*2
            ext         v13.16b, v7.16b, v8.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
    106:    ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v7.16b, v8.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
    105:    ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v7.16b, v8.16b, #4*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
    104:    ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v7.16b, v8.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[4]
            umlal2      v15.4s, v12.8h, v0.h[4]
            umlal       v14.4s, v13.4h, v0.h[4]
            umlal2      v15.4s, v13.8h, v0.h[4]
    103:    ext         v12.16b, v6.16b, v7.16b, #4*2
            ext         v13.16b, v7.16b, v8.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
    102:    ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v7.16b, v8.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
    101:    ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v7.16b, v8.16b, #0*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            // Narrow 32 -> 16 -> 8 bits with rounding and saturation.
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            // Rotate the window along by one register.
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

#define TUNED_LIST4 6, 12, 20
/* hconv4_6: horizontal pass with taps four 16-bit lanes (half a register)
 * apart, radius up to 6.  The centre pair is v7 (low half accumulates into
 * v14, high half into v15) and the window is v4..v10; v11 is rotated in at
 * the end.  Because every tap is a multiple of half a register away, operands
 * are read straight from register halves and no ext is needed: even taps use
 * umlal/umlal2 on one register, odd taps use umlal2 on one register and umlal
 * on the next.
 *
 * x5 (r) selects an entry of the table at 100, which holds 16-bit offsets
 * relative to 100; entry r is the offset of label 100+r, so execution enters
 * at the outermost tap for radius r and falls through to tap 1.  Entry 0 (-4)
 * is the offset of the br itself -- presumably r == 0 never gets here (TODO
 * confirm).
 *
 * Output:   v15.8b -- eight result bytes
 * Modifies: x12, x16, v14-v15, and v4-v10 (window rotation)
 */
.macro hconv4_6/*{{{*/
            umull       v14.4s, v7.4h, v0.h[0]
            umull2      v15.4s, v7.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .align      4
    106:    umlal       v14.4s, v4.4h,  v0.h[6]
            umlal2      v15.4s, v4.8h,  v0.h[6]
            umlal       v14.4s, v10.4h, v0.h[6]
            umlal2      v15.4s, v10.8h, v0.h[6]
    105:    umlal2      v14.4s, v4.8h,  v0.h[5]
            umlal       v15.4s, v5.4h, v0.h[5]
            umlal2      v14.4s, v9.8h, v0.h[5]
            umlal       v15.4s, v10.4h, v0.h[5]
    104:    umlal       v14.4s, v5.4h, v0.h[4]
            umlal2      v15.4s, v5.8h, v0.h[4]
            umlal       v14.4s, v9.4h, v0.h[4]
            umlal2      v15.4s, v9.8h, v0.h[4]
    103:    umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
            umlal2      v14.4s, v8.8h, v0.h[3]
            umlal       v15.4s, v9.4h, v0.h[3]
    102:    umlal       v14.4s, v6.4h, v0.h[2]
            umlal2      v15.4s, v6.8h, v0.h[2]
            umlal       v14.4s, v8.4h, v0.h[2]
            umlal2      v15.4s, v8.8h, v0.h[2]
    101:    umlal2      v14.4s, v6.8h, v0.h[1]
            umlal       v15.4s, v7.4h, v0.h[1]
            umlal2      v14.4s, v7.8h, v0.h[1]
            umlal       v15.4s, v8.4h, v0.h[1]

            // Narrow 32 -> 16 -> 8 bits with rounding and saturation.
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            // Rotate the window along by one register.
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* hconv4_12: horizontal pass with taps four 16-bit lanes (half a register)
 * apart, radius up to 12.  The centre pair is v4 (low half accumulates into
 * v14, high half into v15) and the window is v26..v31 followed by v4..v10;
 * v11 is rotated in at the end.  Operands are read straight from register
 * halves: even taps use umlal/umlal2 on one register, odd taps use umlal2 on
 * one register and umlal on the next.
 *
 * x5 (r) selects an entry of the table at 100, which holds 16-bit offsets
 * relative to 100; entry r is the offset of label 100+r, so execution enters
 * at the outermost tap for radius r and falls through to tap 1.  Entry 0 (-4)
 * is the offset of the br itself -- presumably r == 0 never gets here (TODO
 * confirm).
 *
 * Output:   v15.8b -- eight result bytes
 * Modifies: x12, x16, v14-v15, and v26-v31, v4-v10 (window rotation)
 */
.macro hconv4_12/*{{{*/
            umull       v14.4s, v4.4h, v0.h[0]
            umull2      v15.4s, v4.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .align 4
    112:    umlal       v14.4s, v26.4h, v1.h[4]
            umlal2      v15.4s, v26.8h, v1.h[4]
            umlal       v14.4s, v10.4h, v1.h[4]
            umlal2      v15.4s, v10.8h, v1.h[4]
    111:    umlal2      v14.4s, v26.8h, v1.h[3]
            umlal       v15.4s, v27.4h, v1.h[3]
            umlal2      v14.4s, v9.8h, v1.h[3]
            umlal       v15.4s, v10.4h, v1.h[3]
    110:    umlal       v14.4s, v27.4h, v1.h[2]
            umlal2      v15.4s, v27.8h, v1.h[2]
            umlal       v14.4s, v9.4h, v1.h[2]
            umlal2      v15.4s, v9.8h, v1.h[2]
    109:    umlal2      v14.4s, v27.8h, v1.h[1]
            umlal       v15.4s, v28.4h, v1.h[1]
            umlal2      v14.4s, v8.8h, v1.h[1]
            umlal       v15.4s, v9.4h, v1.h[1]
    108:    umlal       v14.4s, v28.4h, v1.h[0]
            umlal2      v15.4s, v28.8h, v1.h[0]
            umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
    107:    umlal2      v14.4s, v28.8h, v0.h[7]
            umlal       v15.4s, v29.4h, v0.h[7]
            umlal2      v14.4s, v7.8h, v0.h[7]
            umlal       v15.4s, v8.4h, v0.h[7]
    106:    umlal       v14.4s, v29.4h, v0.h[6]
            umlal2      v15.4s, v29.8h, v0.h[6]
            umlal       v14.4s, v7.4h, v0.h[6]
            umlal2      v15.4s, v7.8h, v0.h[6]
    105:    umlal2      v14.4s, v29.8h, v0.h[5]
            umlal       v15.4s, v30.4h, v0.h[5]
            umlal2      v14.4s, v6.8h, v0.h[5]
            umlal       v15.4s, v7.4h, v0.h[5]
    104:    umlal       v14.4s, v30.4h, v0.h[4]
            umlal2      v15.4s, v30.8h, v0.h[4]
            umlal       v14.4s, v6.4h, v0.h[4]
            umlal2      v15.4s, v6.8h, v0.h[4]
    103:    umlal2      v14.4s, v30.8h, v0.h[3]
            umlal       v15.4s, v31.4h, v0.h[3]
            umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
    102:    umlal       v14.4s, v31.4h, v0.h[2]
            umlal2      v15.4s, v31.8h, v0.h[2]
            umlal       v14.4s, v5.4h, v0.h[2]
            umlal2      v15.4s, v5.8h, v0.h[2]
    101:    umlal2      v14.4s, v31.8h, v0.h[1]
            umlal       v15.4s, v4.4h,  v0.h[1]
            umlal2      v14.4s, v4.8h,  v0.h[1]
            umlal       v15.4s, v5.4h, v0.h[1]

            // Narrow 32 -> 16 -> 8 bits with rounding and saturation.
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            // Rotate the window along by one register.
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* hconv4_20: horizontal convolution step for radii up to 20.
 *
 * v14/v15 are seeded with the centre tap (the low and high halves of v28
 * multiplied by v0.h[0]).  A table of halfword offsets indexed by x5 (r) then
 * selects entry label 1NN for radius NN; execution falls through every
 * smaller radius down to 101.  Each radius step accumulates one tap from
 * either side of the centre, both scaled by the same coefficient lane
 * (v0.h[1] for radius 1 up to v2.h[4] for radius 20).
 *
 * The 32-bit sums are narrowed with rounding and saturation, first by 16 bits
 * and then by FRACTION_BITS, leaving eight result bytes in v15.8b.  Finally
 * every window register takes the value of its successor along the chain
 * v18..v31, v4..v10, and v10 takes v11.
 *
 * Reads:  x5, v0-v2 (coefficients), v18-v31 and v4-v11 (window)
 * Writes: x12, x16, v14, v15, and the window registers v18-v31, v4-v10
 */
714.macro hconv4_20/*{{{*/
715            umull       v14.4s, v28.4h, v0.h[0]
716            umull2      v15.4s, v28.8h, v0.h[0]
717
718            adr         x16, 100f                  // x16 = base of the offset table
719            ldrsh       x12, [x16, x5, LSL #1]     // signed halfword offset for radius x5
720            add         x12, x12, x16
721            br          x12
            /* Entry 0 is -4, i.e. the address of the br just above.
             * NOTE(review): presumably x5 == 0 never reaches here -- confirm.
             */
722   100:     .hword -4
723            .hword 101f-100b
724            .hword 102f-100b
725            .hword 103f-100b
726            .hword 104f-100b
727            .hword 105f-100b
728            .hword 106f-100b
729            .hword 107f-100b
730            .hword 108f-100b
731            .hword 109f-100b
732            .hword 110f-100b
733            .hword 111f-100b
734            .hword 112f-100b
735            .hword 113f-100b
736            .hword 114f-100b
737            .hword 115f-100b
738            .hword 116f-100b
739            .hword 117f-100b
740            .hword 118f-100b
741            .hword 119f-100b
742            .hword 120f-100b
743            .align 4
744
            /* Outermost taps first; each label is the entry point for that
             * radius and falls through to all smaller radii.
             */
745    120:    umlal       v14.4s, v18.4h, v2.h[4]
746            umlal2      v15.4s, v18.8h, v2.h[4]
747            umlal       v14.4s, v10.4h, v2.h[4]
748            umlal2      v15.4s, v10.8h, v2.h[4]
749    119:    umlal2      v14.4s, v18.8h, v2.h[3]
750            umlal       v15.4s, v19.4h, v2.h[3]
751            umlal2      v14.4s, v9.8h,  v2.h[3]
752            umlal       v15.4s, v10.4h, v2.h[3]
753    118:    umlal       v14.4s, v19.4h, v2.h[2]
754            umlal2      v15.4s, v19.8h, v2.h[2]
755            umlal       v14.4s, v9.4h,  v2.h[2]
756            umlal2      v15.4s, v9.8h,  v2.h[2]
757    117:    umlal2      v14.4s, v19.8h, v2.h[1]
758            umlal       v15.4s, v20.4h, v2.h[1]
759            umlal2      v14.4s, v8.8h,  v2.h[1]
760            umlal       v15.4s, v9.4h,  v2.h[1]
761    116:    umlal       v14.4s, v20.4h, v2.h[0]
762            umlal2      v15.4s, v20.8h, v2.h[0]
763            umlal       v14.4s, v8.4h,  v2.h[0]
764            umlal2      v15.4s, v8.8h,  v2.h[0]
765    115:    umlal2      v14.4s, v20.8h, v1.h[7]
766            umlal       v15.4s, v21.4h, v1.h[7]
767            umlal2      v14.4s, v7.8h,  v1.h[7]
768            umlal       v15.4s, v8.4h,  v1.h[7]
769    114:    umlal       v14.4s, v21.4h, v1.h[6]
770            umlal2      v15.4s, v21.8h, v1.h[6]
771            umlal       v14.4s, v7.4h,  v1.h[6]
772            umlal2      v15.4s, v7.8h,  v1.h[6]
773    113:    umlal2      v14.4s, v21.8h, v1.h[5]
774            umlal       v15.4s, v22.4h, v1.h[5]
775            umlal2      v14.4s, v6.8h,  v1.h[5]
776            umlal       v15.4s, v7.4h,  v1.h[5]
777    112:    umlal       v14.4s, v22.4h, v1.h[4]
778            umlal2      v15.4s, v22.8h, v1.h[4]
779            umlal       v14.4s, v6.4h,  v1.h[4]
780            umlal2      v15.4s, v6.8h,  v1.h[4]
781    111:    umlal2      v14.4s, v22.8h, v1.h[3]
782            umlal       v15.4s, v23.4h, v1.h[3]
783            umlal2      v14.4s, v5.8h,  v1.h[3]
784            umlal       v15.4s, v6.4h,  v1.h[3]
785    110:    umlal       v14.4s, v23.4h, v1.h[2]
786            umlal2      v15.4s, v23.8h, v1.h[2]
787            umlal       v14.4s, v5.4h,  v1.h[2]
788            umlal2      v15.4s, v5.8h,  v1.h[2]
789    109:    umlal2      v14.4s, v23.8h, v1.h[1]
790            umlal       v15.4s, v24.4h, v1.h[1]
791            umlal2      v14.4s, v4.8h,  v1.h[1]
792            umlal       v15.4s, v5.4h,  v1.h[1]
793    108:    umlal       v14.4s, v24.4h, v1.h[0]
794            umlal2      v15.4s, v24.8h, v1.h[0]
795            umlal       v14.4s, v4.4h,  v1.h[0]
796            umlal2      v15.4s, v4.8h,  v1.h[0]
797    107:    umlal2      v14.4s, v24.8h, v0.h[7]
798            umlal       v15.4s, v25.4h, v0.h[7]
799            umlal2      v14.4s, v31.8h, v0.h[7]
800            umlal       v15.4s, v4.4h,  v0.h[7]
801    106:    umlal       v14.4s, v25.4h, v0.h[6]
802            umlal2      v15.4s, v25.8h, v0.h[6]
803            umlal       v14.4s, v31.4h, v0.h[6]
804            umlal2      v15.4s, v31.8h, v0.h[6]
805    105:    umlal2      v14.4s, v25.8h, v0.h[5]
806            umlal       v15.4s, v26.4h, v0.h[5]
807            umlal2      v14.4s, v30.8h, v0.h[5]
808            umlal       v15.4s, v31.4h, v0.h[5]
809    104:    umlal       v14.4s, v26.4h, v0.h[4]
810            umlal2      v15.4s, v26.8h, v0.h[4]
811            umlal       v14.4s, v30.4h, v0.h[4]
812            umlal2      v15.4s, v30.8h, v0.h[4]
813    103:    umlal2      v14.4s, v26.8h, v0.h[3]
814            umlal       v15.4s, v27.4h, v0.h[3]
815            umlal2      v14.4s, v29.8h, v0.h[3]
816            umlal       v15.4s, v30.4h, v0.h[3]
817    102:    umlal       v14.4s, v27.4h, v0.h[2]
818            umlal2      v15.4s, v27.8h, v0.h[2]
819            umlal       v14.4s, v29.4h, v0.h[2]
820            umlal2      v15.4s, v29.8h, v0.h[2]
821    101:    umlal2      v14.4s, v27.8h, v0.h[1]
822            umlal       v15.4s, v28.4h, v0.h[1]
823            umlal2      v14.4s, v28.8h, v0.h[1]
824            umlal       v15.4s, v29.4h, v0.h[1]
825
            /* Narrow 32 -> 16 -> 8 bits with rounding and unsigned saturation. */
826            uqrshrn     v14.4h, v14.4s, #16
827            uqrshrn2    v14.8h, v15.4s, #16
828            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
829
            /* Advance the window by one register. */
830            mov         v18.16b, v19.16b
831            mov         v19.16b, v20.16b
832            mov         v20.16b, v21.16b
833            mov         v21.16b, v22.16b
834            mov         v22.16b, v23.16b
835            mov         v23.16b, v24.16b
836            mov         v24.16b, v25.16b
837            mov         v25.16b, v26.16b
838            mov         v26.16b, v27.16b
839            mov         v27.16b, v28.16b
840            mov         v28.16b, v29.16b
841            mov         v29.16b, v30.16b
842            mov         v30.16b, v31.16b
843            mov         v31.16b, v4.16b
844            mov         v4.16b, v5.16b
845            mov         v5.16b, v6.16b
846            mov         v6.16b, v7.16b
847            mov         v7.16b, v8.16b
848            mov         v8.16b, v9.16b
849            mov         v9.16b, v10.16b
850            mov         v10.16b, v11.16b
851.endm/*}}}*/
852
/* hconv4_25: horizontal convolution step for radii up to 25.
 *
 * Same structure as the smaller hconv4 variants: v14/v15 are seeded with the
 * centre tap (high half of v25 and low half of v26 times v0.h[0]), a halfword
 * offset table indexed by x5 (r) selects entry label 1NN, and execution falls
 * through all smaller radii down to 101.
 *
 * For radii 18..25 the left-hand taps are not held in registers; they are
 * loaded into v12/v13 from memory at x9 plus an offset, with bit 6 of the
 * address cleared after each step.  Given the alignment that convolve4_*
 * gives x9, this makes the accesses wrap inside a 64-byte ring.  After the
 * sums are narrowed into v15.8b, v17 is stored to [x9], x9 advances by 16
 * (again with bit 6 cleared), and the window registers shift along
 * v17..v31, v4..v10, with v10 taking v11.
 *
 * Reads:  x5, x9, v0-v3 (coefficients), v17-v31 and v4-v11 (window)
 * Writes: x9, x12, x16, v12-v15, memory at the x9 ring, window registers
 */
853.macro hconv4_25/*{{{*/
854            umull2      v14.4s, v25.8h, v0.h[0]
855            umull       v15.4s, v26.4h, v0.h[0]
856
857            adr         x16, 100f                  // x16 = base of the offset table
858            ldrsh       x12, [x16, x5, LSL #1]     // signed halfword offset for radius x5
859            add         x12, x12, x16
860            br          x12
            /* Entry 0 is -4, i.e. the address of the br just above.
             * NOTE(review): presumably x5 == 0 never reaches here -- confirm.
             */
861   100:     .hword -4
862            .hword 101f-100b
863            .hword 102f-100b
864            .hword 103f-100b
865            .hword 104f-100b
866            .hword 105f-100b
867            .hword 106f-100b
868            .hword 107f-100b
869            .hword 108f-100b
870            .hword 109f-100b
871            .hword 110f-100b
872            .hword 111f-100b
873            .hword 112f-100b
874            .hword 113f-100b
875            .hword 114f-100b
876            .hword 115f-100b
877            .hword 116f-100b
878            .hword 117f-100b
879            .hword 118f-100b
880            .hword 119f-100b
881            .hword 120f-100b
882            .hword 121f-100b
883            .hword 122f-100b
884            .hword 123f-100b
885            .hword 124f-100b
886            .hword 125f-100b
887            .align 4
888
            /* Radii 25..18: left-hand taps come from the ring at x9. */
889    125:    ld1         {v12.8h}, [x9]
890            umlal       v14.4s, v12.4h, v3.h[1]
891            umlal2      v15.4s, v12.8h, v3.h[1]
892            umlal       v14.4s, v10.4h, v3.h[1]
893            umlal2      v15.4s, v10.8h, v3.h[1]
894    124:    add         x12, x9, #0x08
895            bic         x12, x12, #0x40
896            ld1         {v12.4h}, [x12], #8
897            bic         x12, x12, #0x40            // second half may sit across the wrap point
898            ld1         {v13.4h}, [x12]
899            umlal       v14.4s, v12.4h, v3.h[0]
900            umlal       v15.4s, v13.4h, v3.h[0]
901            umlal2      v14.4s, v9.8h,  v3.h[0]
902            umlal       v15.4s, v10.4h, v3.h[0]
903    123:    add         x12, x9, #0x10
904            bic         x12, x12, #0x40
905            ld1         {v12.8h}, [x12]
906            umlal       v14.4s, v12.4h, v2.h[7]
907            umlal2      v15.4s, v12.8h, v2.h[7]
908            umlal       v14.4s, v9.4h,  v2.h[7]
909            umlal2      v15.4s, v9.8h,  v2.h[7]
910    122:    add         x12, x9, #0x18
911            bic         x12, x12, #0x40
912            ld1         {v12.4h}, [x12], #8
913            bic         x12, x12, #0x40
914            ld1         {v13.4h}, [x12]
915            umlal       v14.4s, v12.4h, v2.h[6]
916            umlal       v15.4s, v13.4h, v2.h[6]
917            umlal2      v14.4s, v8.8h,  v2.h[6]
918            umlal       v15.4s, v9.4h,  v2.h[6]
919    121:    add         x12, x9, #0x20
920            bic         x12, x12, #0x40
921            ld1         {v12.8h}, [x12]
922            umlal       v14.4s, v12.4h, v2.h[5]
923            umlal2      v15.4s, v12.8h, v2.h[5]
924            umlal       v14.4s, v8.4h,  v2.h[5]
925            umlal2      v15.4s, v8.8h,  v2.h[5]
926    120:    add         x12, x9, #0x28
927            bic         x12, x12, #0x40
928            ld1         {v12.4h}, [x12], #8
929            bic         x12, x12, #0x40
930            ld1         {v13.4h}, [x12]
931            umlal       v14.4s, v12.4h, v2.h[4]
932            umlal       v15.4s, v13.4h, v2.h[4]
933            umlal2      v14.4s, v7.8h,  v2.h[4]
934            umlal       v15.4s, v8.4h,  v2.h[4]
935    119:    add         x12, x9, #0x30
936            bic         x12, x12, #0x40
937            ld1         {v12.8h}, [x12]
938            umlal       v14.4s, v12.4h, v2.h[3]
939            umlal2      v15.4s, v12.8h, v2.h[3]
940            umlal       v14.4s, v7.4h,  v2.h[3]
941            umlal2      v15.4s, v7.8h,  v2.h[3]
942    118:    add         x12, x9, #0x38
943            bic         x12, x12, #0x40
944            ld1         {v12.4h}, [x12]            // only one half from memory; the other is v17.4h
945            umlal       v14.4s, v12.4h, v2.h[2]
946            umlal       v15.4s, v17.4h, v2.h[2]
947            umlal2      v14.4s, v6.8h,  v2.h[2]
948            umlal       v15.4s, v7.4h,  v2.h[2]
            /* Radii 17..1: every tap is in the register file. */
949    117:    umlal       v14.4s, v17.4h, v2.h[1]
950            umlal2      v15.4s, v17.8h, v2.h[1]
951            umlal       v14.4s, v6.4h,  v2.h[1]
952            umlal2      v15.4s, v6.8h,  v2.h[1]
953    116:    umlal2      v14.4s, v17.8h, v2.h[0]
954            umlal       v15.4s, v18.4h, v2.h[0]
955            umlal2      v14.4s, v5.8h,  v2.h[0]
956            umlal       v15.4s, v6.4h,  v2.h[0]
957    115:    umlal       v14.4s, v18.4h, v1.h[7]
958            umlal2      v15.4s, v18.8h, v1.h[7]
959            umlal       v14.4s, v5.4h,  v1.h[7]
960            umlal2      v15.4s, v5.8h,  v1.h[7]
961    114:    umlal2      v14.4s, v18.8h, v1.h[6]
962            umlal       v15.4s, v19.4h, v1.h[6]
963            umlal2      v14.4s, v4.8h,  v1.h[6]
964            umlal       v15.4s, v5.4h,  v1.h[6]
965    113:    umlal       v14.4s, v19.4h, v1.h[5]
966            umlal2      v15.4s, v19.8h, v1.h[5]
967            umlal       v14.4s, v4.4h,  v1.h[5]
968            umlal2      v15.4s, v4.8h,  v1.h[5]
969    112:    umlal2      v14.4s, v19.8h, v1.h[4]
970            umlal       v15.4s, v20.4h, v1.h[4]
971            umlal2      v14.4s, v31.8h, v1.h[4]
972            umlal       v15.4s, v4.4h,  v1.h[4]
973    111:    umlal       v14.4s, v20.4h, v1.h[3]
974            umlal2      v15.4s, v20.8h, v1.h[3]
975            umlal       v14.4s, v31.4h, v1.h[3]
976            umlal2      v15.4s, v31.8h, v1.h[3]
977    110:    umlal2      v14.4s, v20.8h, v1.h[2]
978            umlal       v15.4s, v21.4h, v1.h[2]
979            umlal2      v14.4s, v30.8h, v1.h[2]
980            umlal       v15.4s, v31.4h, v1.h[2]
981    109:    umlal       v14.4s, v21.4h, v1.h[1]
982            umlal2      v15.4s, v21.8h, v1.h[1]
983            umlal       v14.4s, v30.4h, v1.h[1]
984            umlal2      v15.4s, v30.8h, v1.h[1]
985    108:    umlal2      v14.4s, v21.8h, v1.h[0]
986            umlal       v15.4s, v22.4h, v1.h[0]
987            umlal2      v14.4s, v29.8h, v1.h[0]
988            umlal       v15.4s, v30.4h, v1.h[0]
989    107:    umlal       v14.4s, v22.4h, v0.h[7]
990            umlal2      v15.4s, v22.8h, v0.h[7]
991            umlal       v14.4s, v29.4h, v0.h[7]
992            umlal2      v15.4s, v29.8h, v0.h[7]
993    106:    umlal2      v14.4s, v22.8h, v0.h[6]
994            umlal       v15.4s, v23.4h, v0.h[6]
995            umlal2      v14.4s, v28.8h, v0.h[6]
996            umlal       v15.4s, v29.4h, v0.h[6]
997    105:    umlal       v14.4s, v23.4h, v0.h[5]
998            umlal2      v15.4s, v23.8h, v0.h[5]
999            umlal       v14.4s, v28.4h, v0.h[5]
1000            umlal2      v15.4s, v28.8h, v0.h[5]
1001    104:    umlal2      v14.4s, v23.8h, v0.h[4]
1002            umlal       v15.4s, v24.4h, v0.h[4]
1003            umlal2      v14.4s, v27.8h, v0.h[4]
1004            umlal       v15.4s, v28.4h, v0.h[4]
1005    103:    umlal       v14.4s, v24.4h, v0.h[3]
1006            umlal2      v15.4s, v24.8h, v0.h[3]
1007            umlal       v14.4s, v27.4h, v0.h[3]
1008            umlal2      v15.4s, v27.8h, v0.h[3]
1009    102:    umlal2      v14.4s, v24.8h, v0.h[2]
1010            umlal       v15.4s, v25.4h, v0.h[2]
1011            umlal2      v14.4s, v26.8h, v0.h[2]
1012            umlal       v15.4s, v27.4h, v0.h[2]
1013    101:    umlal       v14.4s, v25.4h, v0.h[1]
1014            umlal2      v15.4s, v25.8h, v0.h[1]
1015            umlal       v14.4s, v26.4h, v0.h[1]
1016            umlal2      v15.4s, v26.8h, v0.h[1]
1017
            /* Narrow 32 -> 16 -> 8 bits with rounding and unsigned saturation. */
1018            uqrshrn     v14.4h, v14.4s, #16
1019            uqrshrn2    v14.8h, v15.4s, #16
1020            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS
1021
            /* Spill v17 into the ring before it is overwritten by the shift
             * below, then advance the ring pointer with wrap.
             */
1022            st1         {v17.16b}, [x9], #16
1023            bic         x9, x9, #0x40
1024            mov         v17.16b, v18.16b
1025            mov         v18.16b, v19.16b
1026            mov         v19.16b, v20.16b
1027            mov         v20.16b, v21.16b
1028            mov         v21.16b, v22.16b
1029            mov         v22.16b, v23.16b
1030            mov         v23.16b, v24.16b
1031            mov         v24.16b, v25.16b
1032            mov         v25.16b, v26.16b
1033            mov         v26.16b, v27.16b
1034            mov         v27.16b, v28.16b
1035            mov         v28.16b, v29.16b
1036            mov         v29.16b, v30.16b
1037            mov         v30.16b, v31.16b
1038            mov         v31.16b, v4.16b
1039            mov         v4.16b, v5.16b
1040            mov         v5.16b, v6.16b
1041            mov         v6.16b, v7.16b
1042            mov         v7.16b, v8.16b
1043            mov         v8.16b, v9.16b
1044            mov         v9.16b, v10.16b
1045            mov         v10.16b, v11.16b
1046.endm/*}}}*/
1047
1048/* Dedicated function wrapper for the fetch macro, for the cases where
1049 * performance isn't that important, to keep code size down.
1050 */
/* Out-of-line expansion of the fetch macro with its default arguments.
 * x10 and x11 are saved on the stack and restored around the expansion, so
 * callers see them unchanged.  Callers in this file reach it with bl.
 */
1051PRIVATE(fetch_generic_asm)
1052            stp         x10, x11, [sp, #-16]!      // push x10/x11 (16 bytes)
1053            fetch
1054            ldp         x10, x11, [sp], #16        // pop x10/x11
1055            ret
1056END(fetch_generic_asm)
1057
1058/* Given values in q10 and q11, and an index in x11, sweep the (x11&15)th value
1059 * across to fill the rest of the register pair.  Used for filling the right
1060 * hand edge of the window when starting too close to the right hand edge of
1061 * the image.
1062 */
/* Register usage, as visible in the body:
 *   x11       negated on entry and negated back before returning, so the tbz
 *             tests in between look at bits of -x11.
 *   x1        decremented by 16/8/4/2 according to bits 3/2/1/0 of -x11.
 *   x15, x19  rebased against x1 (x1 subtracted on entry, the adjusted x1
 *             added on exit) so they receive the same net adjustment.
 *   v12       on return every halfword lane holds the replicated pad value.
 *   v13       scratch lane mask for the bif merge.
 */
1063PRIVATE(prefetch_clamp1)
1064            sub         x11, xzr, x11              // x11 = -x11
1065            sub         x15, x15, x1               // make x15/x19 relative to x1
1066            sub         x19, x19, x1
1067            tbz         x11, #3, 1f
1068            mov         v11.16b, v10.16b
1069            sub         x1, x1, #16
10701:          mov         v12.16b, v11.16b
1071            movi        v13.8b, #0xff              // mask starts as all-ones in the low 64 bits
1072            tbz         x11, #2, 1f
1073            ext         v12.16b, v12.16b, v12.16b, #4*2
1074            sub         x1, x1, #8
1075            shl         v13.2d, v13.2d, #32
10761:          tbz         x11, #1, 1f
1077            ext         v12.16b, v12.16b, v12.16b, #6*2
1078            sub         x1, x1, #4
1079            shl         v13.2d, v13.2d, #16
10801:          tbz         x11, #0, 1f
1081            ext         v12.16b, v12.16b, v12.16b, #7*2
1082            sub         x1, x1, #2
1083            shl         v13.2d, v13.2d, #8
10841:          dup         v12.8h, v12.h[6]           // broadcast the selected value to all lanes
1085            sxtl        v13.8h, v13.8b             // widen the byte mask to halfword lanes
1086            bif         v11.16b, v12.16b, v13.16b  // take v12 wherever the mask bit is clear
10871:          tbz         x11, #3, 1f
1088            mov         v10.16b, v11.16b
1089            mov         v11.16b, v12.16b
10901:          sub         x11, xzr, x11              // restore the sign of x11
1091            add         x15, x15, x1               // rebase x15/x19 on the adjusted x1
1092            add         x19, x19, x1
1093            ret
1094END(prefetch_clamp1)
1095
/* Variant of prefetch_clamp1 that works on 64-bit (.d) lanes rather than
 * halfwords, so only bits 3 and 2 of the negated index are examined.
 *
 *   x11       negated on entry and negated back before returning.
 *   x1        decremented by 16 if bit 3 of -x11 is set, by 8 if bit 2 is set.
 *   x15, x19  rebased against x1 so they receive the same net adjustment.
 *   v12       on return holds the replicated 64-bit pad value in both lanes.
 */
1096PRIVATE(prefetch_clamp4)
1097            sub         x11, xzr, x11              // x11 = -x11
1098            sub         x15, x15, x1               // make x15/x19 relative to x1
1099            sub         x19, x19, x1
1100            tbz         x11, #3, 1f
1101            sub         x1, x1, #16     // what's this?
1102            mov         v11.16b, v10.16b
11031:          dup         v12.2d, v11.d[1]           // default pad: high lane of v11
1104            tbz         x11, #2, 1f
1105            dup         v12.2d, v11.d[0]           // pad is the low lane instead
1106            sub         x1, x1, #8
1107            dup         v11.2d, v11.d[0]
11081:          tbz         x11, #3, 1f
1109            mov         v10.16b, v11.16b
1110            mov         v11.16b, v12.16b
11111:          sub         x11, xzr, x11              // restore the sign of x11
1112            add         x15, x15, x1               // rebase x15/x19 on the adjusted x1
1113            add         x19, x19, x1
1114            ret
1115END(prefetch_clamp4)
1116
1117
1118/* Helpers for prefetch, below.
1119 */
/* prefetch_out: deliver one register pair (\qsa, \qsb) to its destination.
 *
 *   \store == 2   store both registers to [x9], advancing x9 by 32 in total.
 *                 When \qsa and \qsb name the same register, two single
 *                 stores are issued instead of one two-register store.
 *   \store == 1   clear bit 6 of x9, store \qsa to [x9] (x9 += 16) and copy
 *                 \qsb into \qb.
 *   \store == 0   copy \qsa into \qa and \qsb into \qb.
 *
 * \qsb_hi is accepted but not referenced in the body.
 */
1120.macro prefetch_out qa, qb, store, qsa, qsb, qsb_hi
1121  .if \store == 2
1122    .ifc \qsa,\qsb
1123            st1         {\qsa}, [x9], #16
1124            st1         {\qsb}, [x9], #16
1125    .else
1126            st1         {\qsa,\qsb}, [x9], #32
1127    .endif
1128  .elseif \store == 1
1129            bic         x9, x9, #0x40
1130            st1         {\qsa}, [x9], #16
1131            mov         \qb, \qsb
1132  .elseif \store == 0
1133            mov         \qa, \qsa
1134            mov         \qb, \qsb
1135  .endif
1136.endm
1137
/* prefetch_one: fill one 16-element stage of the window.
 *
 * i = (need - 16) - \rem; nothing is emitted when i is negative.  Each
 * expansion has four entry labels that the previous expansion branches to:
 *
 *   1:  x10 >= i+16   emit the v9 pair, then go on to the next stage's 1:.
 *   2:  x11 >  i+16   emit v10/v11, call fetch_generic_asm, then go on to the
 *                     next stage's 2: (the x10 test is skipped from here on).
 *   3:  otherwise     call prefetch_clamp\step and emit v10/v11, falling
 *                     through to 4:.
 *   4:  branches to "4f+4", one instruction past the next stage's 4: label.
 *       That lands on the next stage's v12 emit, so every later stage is
 *       filled from v12.
 *
 * In the final stage (\rem == 0) all four labels resolve to the trailing nop.
 * The \c argument is accepted but not referenced in the body.
 */
1138.macro prefetch_one  qa, qb, rem, c, store=0, step=1
1139.set i, (need - 16) - \rem
1140.if i >= 0
11411:          cmp         x10, #i+16
1142            blo         2f
1143            prefetch_out \qa, \qb, \store, v9.16b, v9.16b, v9.d[1]
1144            b           1f
11452:          cmp         x11, #i+16
1146            bls         3f
1147            prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
1148            bl          fetch_generic_asm
1149            b           2f
11503:          bl          prefetch_clamp\step
1151            prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
11524:          b           4f+4
1153           //v12 contains pad word from prefetch_clamp call
1154            prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
1155  .if \rem > 0
1156            b           4f+4
1157  .else
11581:
11592:
11603:
11614:          nop
1162  .endif
1163.endif
1164.endm
1165
1166/* Fill the convolution window with context data.  The aim here is to load
1167 * exactly rlf + rrt columns, and in the main loop to read as many columns as
1168 * will be written.  This is complicated by the need to handle cases when the
1169 * input starts very close to the left or right (or both) edges of the image,
1170 * and where these do not fall on 16-byte boundaries.
1171 *
1172 * Input:
1173 *      x1 -- src
1174 *      x2 -- pitch
1175 *      x3 -- count
1176 *      x4 -- inlen
1177 *      x5 -- r
1178 *      x6 -- rup
1179 *      x7 -- rdn
1180 *      x8 -- rlf
1181 *      x9 -- buffer (if needed)
1182 *      x13 = -pitch
1183 *      x15 = top-row in
1184 *      x19 = bottom-row in
1185 * Output:
1186 *      x1 += rlf + min(count, rrt)
1187 * Modifies:
1188 *      x10 -- fill start index in the window
1189 *      x11 -- fill stop index in the window
1190 *      x12 -- scratch
1191 */
/* Outline of the expansion below:
 *   1. need = 2 * max_r * step rounded up to a multiple of 16.
 *      x10 = need - max_r*step - rlf (rlf scaled by 4 when step != 1);
 *      x11 = min(x10 + inlen, need).
 *   2. Fetch the first 16 columns and replicate their first element (a
 *      halfword for step 1, a doubleword otherwise) into v9.
 *   3. If x10 is not a multiple of 16, shift v10/v11 up with v9 feeding in
 *      from below, rewind x1/x15/x19 by (x10 & 15) and round x10 down.
 *   4. Run the prefetch_one stages; the first nine are only expanded when
 *      step > 1.
 *   5. inlen (x4) is reduced by rlf*step + max_r*step, clamped at zero.
 */
1192.macro prefetch step=1, max_r=25
1193.set need, ((\max_r + \max_r) * \step + 15) & ~15
1194  .if \step == 1
1195            mov         x10, #need - (\max_r * \step)
1196            sub         x10, x10, x8
1197  .else
1198            mov         x10, #need - (\max_r * \step)
1199            sub         x10, x10, x8, LSL #2
1200  .endif
1201            add         x11, x10, x4
1202            subs        x11, x11, #need
1203            csel        x11, xzr, x11, hi          // clamp the excess over need to zero
1204            add         x11, x11, #need
1205
1206            bl          fetch_generic_asm
1207  .if \step == 1
1208            dup         v9.8h, v10.h[0]
1209  .else
1210            dup         v9.2d, v10.d[0]
1211  .endif
1212            tst         x10, #15
1213            beq         2f
1214            sub         x12, xzr, x10              // x12 = -x10
            /* Bit 3 is tested on x10 itself; the finer steps below test bits
             * of x12 (= -x10).
             */
1215            tbz         x10, #3, 1f
1216            mov         v11.16b, v10.16b
1217            mov         v10.16b, v9.16b
12181:          tbz         x12, #2, 1f
1219            ext         v11.16b, v10.16b, v11.16b, #4*2
1220            ext         v10.16b, v9.16b, v10.16b, #4*2
1221  .if \step == 1
1222  1:        tbz         x12, #1, 1f
1223            ext         v11.16b, v10.16b, v11.16b, #2*2
1224            ext         v10.16b, v9.16b, v10.16b, #2*2
1225  1:        tbz         x12, #0, 1f
1226            ext         v11.16b, v10.16b, v11.16b, #1*2
1227            ext         v10.16b, v9.16b, v10.16b, #1*2
1228  .endif
            /* Net effect of the next seven instructions: x1, x15 and x19 each
             * drop by (x10 & 15), and x10 is rounded down to a multiple of 16.
             */
12291:          sub         x1, x1, x10
1230            sub         x15, x15, x10
1231            sub         x19, x19, x10
1232            bic         x10, x10, #15
1233            add         x1, x1, x10
1234            add         x15, x15, x10
1235            add         x19, x19, x10
12362:
1237  .if \step > 1
1238            /* it's only in the uchar2 and uchar4 cases where the register file
1239             * is insufficient (given MAX_R <= 25).
1240             */
1241            prefetch_one xx, xx, 192, c=\max_r, step=\step, store=2
1242            prefetch_one xx, xx, 176, c=\max_r, step=\step, store=2
1243            prefetch_one xx,      v17.16b, 160, c=\max_r, step=\step, store=1
1244            prefetch_one v18.16b, v19.16b, 144, c=\max_r, step=\step, store=0
1245            prefetch_one v20.16b, v21.16b, 128, c=\max_r, step=\step, store=0
1246            prefetch_one v22.16b, v23.16b, 112, c=\max_r, step=\step, store=0
1247            prefetch_one v24.16b, v25.16b,  96, c=\max_r, step=\step, store=0
1248            prefetch_one v26.16b, v27.16b,  80, c=\max_r, step=\step, store=0
1249            prefetch_one v28.16b, v29.16b,  64, c=\max_r, step=\step, store=0
1250  .endif
1251            prefetch_one v30.16b, v31.16b,  48, c=\max_r, step=\step, store=0
1252            prefetch_one v4.16b,  v5.16b,   32, c=\max_r, step=\step, store=0
1253            prefetch_one v6.16b,  v7.16b,   16, c=\max_r, step=\step, store=0
1254            prefetch_one v8.16b,  v9.16b,    0, c=\max_r, step=\step, store=0
1255
1256  .if \step == 1
1257            add         x10, x8, #\max_r * \step
1258  .else
1259            lsl         x10, x8, #2
1260            add         x10, x10, #\max_r * \step
1261  .endif
1262            subs        x4, x4, x10
1263            csel        x4, xzr, x4, lo            // never let inlen go below zero
1264.endm
1265
1266/* The main loop.
1267 *
1268 * Input:
1269 *      x0 = dst
1270 *      x1 = src
1271 *      x2 = pitch
1272 *      x3 = count
1273 *      x4 = inlen
1274 *      x5 = r
1275 *      x6 = rup
1276 *      x7 = rdn
1277 *      x9 = buffer
1278 *      x13 = -pitch
1279 *      x15 = top-row in
1280 *      x19 = bottom-row in
1281 * Modifies
1282 *      x8 = fetch code pointer
1283 */
/* mainloop: body of each convolve function, after prefetch has filled the
 * window.
 *
 *   Phase 1 (labels 3/5, first pair):  while at least 16 input columns remain
 *       in x4, fetch 16 columns through the code pointer in x8 and run \core
 *       twice, storing 8 output bytes after each call.
 *   Phase 2:  when fewer than 16 input columns remain, either pad v10/v11
 *       from the last element of v9 (nothing left) or re-fetch the final 16
 *       columns ending at the end of the input and shift the unwanted leading
 *       elements out, padding from the last element fetched.
 *   Phase 3 (second label 3):  run \core for the remaining output in x3,
 *       8 bytes at a time, keeping v11 filled with padding.
 *   Phase 4 (second label 4):  write a final partial group of 1..7 bytes.
 *
 * Register contract: see the comment block preceding this macro.
 */
.macro mainloop core, step=1, max_r=25, labelc="", labelnc=""
            /* Start by assuming the non-clamping fetch table:
             * x8 = labelnc - 40 * r.
             */
            adrp        x8, \labelnc
            add         x8, x8, #:lo12:\labelnc
            sub         x8, x8, x5, LSL #5
            sub         x8, x8, x5, LSL #3
            cmp         x5, x6
            ccmp        x5, x7, #0, eq
            beq         5f

            /* if (r != rup || r != rdn) then the address-clamping table should
             * be used rather than the short-cut version:
             * x8 = labelc - 56 * r.
             */
            adrp        x8, \labelc
            add         x8, x8, #:lo12:\labelc
            sub         x8, x8, x5, LSL #6
            add         x8, x8, x5, LSL #3
            b           5f
            .align  4
3:          fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8

            /* For each call to fetch two are made to \core.  It would be
             * preferable to have twice the work done in \core.
             */
            \core
            st1         {v15.8b}, [x0], #8
            \core
            st1         {v15.8b}, [x0], #8

            sub         x3, x3, #16
5:          subs        x4, x4, #16
            bhs         3b
            adds        x4, x4, #16             // undo the last subtract; x4 = columns left (0..15)
            bne         1f
            /* No input left: the incoming columns are all padding. */
  .if \step==1
            dup         v10.8h, v9.h[7]
            dup         v11.8h, v9.h[7]
  .else
            dup         v10.2d, v9.d[1]
            dup         v11.2d, v9.d[1]
  .endif
            b           4f

            /* 1..15 columns left: step the pointers back so that one more
             * 16-column fetch ends exactly at the end of the input.
             */
1:          sub         x1, x1, #16
            sub         x15, x15, #16
            sub         x19, x19, #16
            add         x1, x1, x4
            add         x15, x15, x4
            add         x19, x19, x4
            bl          fetch_generic_asm

  .if \step==1
            dup         v12.8h, v11.h[7]        // v12 = padding from the last element fetched
  .else
            dup         v12.2d, v11.d[1]
  .endif
            /* Shift out the leading elements, selecting the steps from the
             * bits of -x4 and feeding v12 in at the top.
             */
            sub         x4, xzr, x4
            tbz         x4, #3, 1f
            mov         v10.16b, v11.16b
            mov         v11.16b, v12.16b
1:          tbz         x4, #2, 1f
            ext         v10.16b, v10.16b, v11.16b, #4*2
            ext         v11.16b, v11.16b, v12.16b, #4*2
1:          tbz         x4, #1, 1f
            ext         v10.16b, v10.16b, v11.16b, #2*2
            ext         v11.16b, v11.16b, v12.16b, #2*2
1:          tbz         x4, #0, 4f
            ext         v10.16b, v10.16b, v11.16b, #1*2
            ext         v11.16b, v11.16b, v12.16b, #1*2
4:          cbz         x3, 5f
3:          \core
  .if \step==1
            dup         v11.8h, v11.h[7]
  .else
            dup         v11.2d, v11.d[1]
  .endif
            subs        x3, x3, #8
            blo         4f                      // fewer than 8 bytes wanted: partial store
            st1         {v15.8b}, [x0], #8
            beq         5f
            b           3b

            /* Partial store of the 1..7 remaining bytes held in v15.8b.
             * After each piece is written the 8-byte vector is rotated down by
             * exactly the number of bytes just stored, so the next piece is
             * again in the lowest lanes.
             */
4:          tbz         x3, #2, 1f
            st1         {v15.s}[0], [x0], #4
            ext         v15.8b, v15.8b, v15.8b, #4
1:          tbz         x3, #1, 1f
            st1         {v15.h}[0], [x0], #2
            ext         v15.8b, v15.8b, v15.8b, #2
1:          tbz         x3, #0, 5f
            st1         {v15.b}[0], [x0], #1
            ext         v15.8b, v15.8b, v15.8b, #1
5:          nop
.endm
1375
/* Instantiate convolve1_<r> once for every radius in TUNED_LIST1, plus 25.
 * Each instance pairs the step=1 prefetch with the main loop built around
 * hconv1_<r>.  x29/x30 are saved because the expansions contain bl
 * instructions; the final ret goes to whatever address x30 held on entry,
 * which rsdIntrinsicBlurU1_K sets with adr before branching here.
 */
1376.irep r, TUNED_LIST1, 25
1377PRIVATE(convolve1_\r)
1378            stp         x29,x30, [sp, #-16]!
1379
1380            prefetch    step=1, max_r=\r
1381
1382            mainloop    core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r
1383
1384            ldp         x29,x30, [sp], #16
1385            ret
1386END(convolve1_\r)
1387.endr
1388
/* Instantiate convolve4_<r> once for every radius in TUNED_LIST4, plus 25.
 *
 * Stack layout: x12 = entry sp - 0x40; x9 = x12 rounded down to a multiple
 * of 128.  sp is moved to x9 and then x12/x30 are pushed below it, so x9 is
 * the base of the buffer used by prefetch and the hconv4 macros.  On exit
 * x12 is reloaded and sp is restored to x12 + 0x40, the entry value.
 */
1389.irep r, TUNED_LIST4, 25
1390PRIVATE(convolve4_\r)
1391            sub         x12, sp, #0x040
1392            bic         x9, x12, #0x07f
1393            mov         sp, x9
1394            stp         x12,x30, [sp, #-16]!       // keep the unaligned value so sp can be restored
1395
1396            /* x9 now points to a buffer on the stack whose address has the low
1397             * 7 bits clear.  This allows easy address calculation in the
1398             * wrap-around cases.
1399             */
1400
1401
1402            prefetch    step=4, max_r=\r
1403
1404            mainloop    core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r
1405
1406            ldp         x12,x30, [sp]
1407            add         sp, x12, #0x40             // back to the entry sp
1408            ret
1409END(convolve4_\r)
1410.endr
1411
1412/* void rsdIntrinsicBlurU1_K(
1413 *                  void *out,      // x0
1414 *                  void *in,       // x1
1415 *                  size_t w,       // x2
1416 *                  size_t h,       // x3
1417 *                  size_t p,       // x4
1418 *                  size_t x,       // x5
1419 *                  size_t y,       // x6
1420 *                  size_t count,   // x7
1421 *                  size_t r,       // [sp]
1422 *                  uint16_t *tab); // [sp,#8]
1423 */
/* Entry point for single-channel (one byte per pixel) blur.
 *
 * Saves x19, x30 and the low halves of v8-v15, converts the C arguments into
 * the register layout used by the convolve1_* functions, loads the
 * coefficient table into v0-v3 and dispatches on the radius.
 *
 * Register layout handed to convolve1_*:
 *      x0 = out + x            x1 = in + x - rlf
 *      x2 = pitch              x3 = count
 *      x4 = rlf + rrt + count  x5 = r
 *      x6 = rup = min(y, r)    x7 = rdn = min(h - y - 1, r)
 *      x8 = rlf = min(x, r)    x9 = rrt = min(w - x - count, r)
 *      x13 = -pitch            x15 = x1 - pitch * rup
 *      x19 = x1 + pitch * rdn
 */
ENTRY(rsdIntrinsicBlurU1_K)
            stp         x19,x30, [sp, #-16]!
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]
            st1         {v12.1d - v15.1d}, [x8]
            mov         x8, x5        // x
            /* sp has dropped by 16 + 64 = 80 bytes, so the caller's stack
             * arguments are at [sp,#80] and [sp,#88].
             */
            ldr         w5, [sp,#80]  // r
            sub         x9, x2, x8    // w - x
            sub         x10, x3, x6   // h - y
            mov         x2, x4        // pitch
            mov         x3, x7        // count
            sub         x7, x10, #1   // h - y - 1
            sub         x9, x9, x3    // w - x - count

            ldr         x12, [sp, #88] // tab

            add         x0, x0, x8
            add         x1, x1, x8

            /* Clamp each of the four context distances to at most r. */
            cmp         x6, x5
            csel        x6, x5, x6, hs
            cmp         x7, x5
            csel        x7, x5, x7, hs
            cmp         x8, x5
            csel        x8, x5, x8, hs
            cmp         x9, x5
            csel        x9, x5, x9, hs  // rrt keeps its own value when below r

            add         x4, x8, x9
            add         x4, x4, x3

            sub         x1, x1, x8

            sub         x13, xzr, x2
            msub        x15, x2, x6, x1
            madd        x19, x2, x7, x1

            ld1         {v0.8h,v1.8h}, [x12], #32
            ld1         {v2.8h,v3.8h}, [x12], #32

            /* The convolve functions are entered with a plain branch, so the
             * return address is placed in x30 by hand.
             */
            adr         x30, 1f
  .irep r, TUNED_LIST1
            cmp         x5, #\r
            bls         convolve1_\r
  .endr
            b           convolve1_25

1:          ld1         {v8.1d - v11.1d}, [sp], #32
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x19,x30, [sp], #16
            ret
END(rsdIntrinsicBlurU1_K)
1477
1478/* void rsdIntrinsicBlurU4_K(
1479 *                  void *out,      // x0
1480 *                  void *in,       // x1
1481 *                  size_t w,       // x2
1482 *                  size_t h,       // x3
1483 *                  size_t p,       // x4
1484 *                  size_t x,       // x5
1485 *                  size_t y,       // x6
1486 *                  size_t count,   // x7
1487 *                  size_t r,       // [sp]
1488 *                  uint16_t *tab); // [sp,#8]
1489 */
/* Entry point for four-channel blur.  Same flow as rsdIntrinsicBlurU1_K,
 * except that pixel offsets are scaled by 4 (LSL #2) when applied to the
 * pointers, and count/inlen are converted to bytes before dispatch:
 *      x3 = count * 4
 *      x4 = (rlf + rrt) * 4 + count * 4
 */
1490ENTRY(rsdIntrinsicBlurU4_K)
1491            stp         x19,x30, [sp, #-16]!
1492            sub         x8, sp, #32
1493            sub         sp, sp, #64
1494            st1         {v8.1d - v11.1d}, [sp]
1495            st1         {v12.1d - v15.1d}, [x8]
1496            mov         x8, x5        // x
            /* sp has dropped by 16 + 64 = 80 bytes, so the caller's stack
             * arguments are at [sp,#80] and [sp,#88].
             */
1497            ldr         w5, [sp,#80]  // r
1498            sub         x9, x2, x8
1499            sub         x10, x3, x6
1500            mov         x2, x4        // pitch
1501            mov         x3, x7        // count
1502            sub         x7, x10, #1
1503            sub         x9, x9, x3
1504
1505            ldr         x12, [sp, #88]             // tab
1506
1507            add         x0, x0, x8, LSL #2
1508            add         x1, x1, x8, LSL #2
1509
            /* Clamp each of the four context distances to at most r. */
1510            cmp         x6, x5
1511            csel        x6, x5, x6, hs
1512            cmp         x7, x5
1513            csel        x7, x5, x7, hs
1514            cmp         x8, x5
1515            csel        x8, x5, x8, hs
1516            cmp         x9, x5
1517            csel        x9, x5, x9, hs
1518
1519            lsl         x3, x3, #2
1520            add         x4, x8, x9
1521            add         x4, x3, x4, LSL #2
1522
1523            sub         x1, x1, x8, LSL #2
1524
1525            sub         x13, xzr, x2
1526            msub        x15, x2, x6, x1
1527            madd        x19, x2, x7, x1
1528
1529            ld1         {v0.8h,v1.8h}, [x12], #32
1530            ld1         {v2.8h,v3.8h}, [x12], #32
1531
            /* The convolve functions are entered with a plain branch, so the
             * return address is placed in x30 by hand.
             */
1532            adr         x30, 1f
1533  .irep r, TUNED_LIST4
1534            cmp         x5, #\r
1535            bls         convolve4_\r
1536  .endr
1537            b           convolve4_25
1538
15391:          ld1         {v8.1d - v11.1d}, [sp], #32
1540            ld1         {v12.1d - v15.1d}, [sp], #32
1541            ldp         x19,x30, [sp], #16
1542            ret
1543END(rsdIntrinsicBlurU4_K)
1544