• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2021 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
// Template guards: only FP32 or RNDNU requantization is accepted; CHANNELWISE
// and the QC8 datatype are each only accepted together with FP32.
6$assert REQUANTIZATION in ["FP32", "RNDNU"]
7$assert not CHANNELWISE or REQUANTIZATION == "FP32"
8$assert DATATYPE in ["QC8", "QS8", "QU8"]
9$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
10
11#include <xnnpack/assembly.h>
12
13.syntax unified
14
// Derived template strings: params type name used in the signature comment,
// ISA/CPU suffixes of the function name, and the unsigned (QU8) or signed
// variants of the min/max/widen/narrow instructions and of the element type.
// NOTE(review): PARAMS_UNION depends only on CHANNELWISE, so a QU8 kernel is
// also documented with a qs8-named params type - confirm this is intended.
15$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
16$ISA = "neonv8" if ARMV8 else "neon"
17$CPU = "a35" if ARMV8 else "a7"
18$XMIN = "VMIN.U8" if DATATYPE == "QU8" else "VMIN.S8"
19$XMAX = "VMAX.U8" if DATATYPE == "QU8" else "VMAX.S8"
20$XXTL = "VMOVL.U8" if DATATYPE == "QU8" else "VMOVL.S8"
21$SQXTXN = "VQMOVUN.S16" if DATATYPE == "QU8" else "VQMOVN.S16"
22$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
23// void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}
24//     size_t mr,                                     (r0)
25//     size_t nc,                                      r1
26//     size_t kc,                                     (r2) -> sp + 56 -> r5
27//     size_t ks,                                     (r3) -> sp + 60 -> r14
28//     const ${XINT8_T}**restrict a,            sp + 88  -> r2
29//     const void*restrict w,              sp + 92  -> r9
30//     ${XINT8_T}*restrict c,                   sp + 96  -> r11
31//     size_t cm_stride,                   sp + 100  -> r6
32//     size_t cn_stride,                   sp + 104  -> r12
33//     size_t a_offset,                    sp + 108 -> (r5)
34//     const ${XINT8_T}* zero,                  sp + 112 -> r7
35//     ${PARAMS_UNION}*params); sp + 116 -> (r5)
36
37// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
38
39// Based on cortex_a53 microkernel but with Neon loads
40
41// Register usage
42// A0   r3  d0-d1 q0
43
44// B    r9  d8-d9 q4 q5
45
46// C0  r11 d16-d17  q8  d18-d19  q9
47//         q2, q3 acc2
48
49// Unused r4, r8, r10, d15, q10-q11, q13-q15 (q1 is only used by the QC8 requantization)
50
51$if REQUANTIZATION == "RNDNU" and DATATYPE != "QU8":
52  // params structure is 16 bytes
53  //  struct {
54  //    int32_t right_pre_shift;    d12[0]
55  //    int32_t multiplier;         d12[1]
56  //    int32_t right_post_shift;   d13[0]
57  //    int16_t output_zero_point;  d13[2]
58  //    int8_t output_min;          d13[6]
59  //    int8_t output_max;          d13[7]
60  //  } rndnu_neon;
61$elif REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
62  // params structure is 20 bytes
63  //  struct {
64  //    uint8_t kernel_zero_point[4];  d14
65  //    int32_t right_pre_shift;       d12[0]
66  //    int32_t multiplier;            d12[1]
67  //    int32_t right_post_shift;      d13[0]
68  //    int16_t output_zero_point;     d13[2]
69  //    uint8_t output_min;            d13[6]
70  //    uint8_t output_max;            d13[7]
71  //  } rndnu_neon;
72$elif DATATYPE == "QC8" and not ARMV8:
73  // params structure is 10 bytes
74  // struct {
75  //   float magic_bias;                           d12[0]
76  //   int32_t magic_bias_less_output_zero_point;  d12[1]
77  //   int8_t output_min;                          d13[6]
78  //   int8_t output_max;                          d13[7]
79  // } xnn_qs8_minmax_params.neon;
80$else:
81  // params structure is 4 bytes
82  //  struct {
83  //    int16_t output_zero_point;  d13[2]
84  //    int8_t output_min;          d13[6]
85  //    int8_t output_max;          d13[7]
86  //  } xnn_qs8_minmax_params.neonv8;
87
88BEGIN_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}
        // Produces one row of 8 requantized outputs per pass of the nc loop.
        // Label map:
        //   0     start of a group of 8 output columns (loads bias, clears q2/q3)
        //   1     one indirection pointer of a (ks counts down 4 bytes per pointer)
        //   2, 3  main loop over 8 bytes of kc and its epilogue
        //   5, 6  1-7 leftover bytes of kc
        //   4     end of one ks step; falls through to requantize, clamp and store
        //   7-10  store of a partial (<8) group of outputs and return
        // Both register-save variants below build the same 88-byte frame, so the
        // sp-relative offsets of the stack arguments are identical for all datatypes.
89        # Push 88 bytes
90        # r2, r3 will be reloaded in outer loop.
91        PUSH    {r2, r3, r5, r6, r7, r9, r11, lr}     // +32
92        $if DATATYPE == "QU8":
93          VPUSH   {d8-d14}                            // +56 = 88
94        $else:
95          SUB     sp, sp, 8                           // +8
96          VPUSH   {d8-d13}                            // +48 = 88
97
98        LDR     r2,  [sp, 88]           // a
99        LDR     r9,  [sp, 92]           // w
100        LDR     r11, [sp, 96]           // c
101        LDR     r6,  [sp, 100]          // cm_stride (r6 is not referenced again before the final POP)
102        LDR     r12, [sp, 104]          // cn_stride
103        LDR     r7,  [sp, 112]          // zero
104        LDR     r5,  [sp, 116]          // params
105        MOV     r14, r3                 // p = ks
106
107        # Load params values
        // NOTE(review): no branch below loads params for FP32 with QS8/QU8, and no
        // requantization code is emitted for that combination either, although the
        // template asserts permit it - confirm it is never generated.
108        $if DATATYPE == "QU8":
109          VLD1.32 {d14[]}, [r5]!          // QU8 kernel_zero_point
110        $if REQUANTIZATION == "RNDNU":
111          VLDM    r5, {d12-d13}           // RNDNU params
112        $elif DATATYPE == "QC8" and ARMV8:
113          VLD1.32 {d13[]}, [r5]           // QC8 neonv8 params
114        $elif DATATYPE == "QC8" and not ARMV8:
115          VLDM    r5!, {d12}              // QC8 neon params
116          VLD1.16 {d13[]}, [r5]
117
118        $if PREFETCH:
119          PLD     [r9,  64]               // Prefetch B
120          PLD     [r9, 112]               // NOTE(review): breaks the 64-byte step of the other offsets - confirm 128 was not intended
121          PLD     [r9, 192]
122          PLD     [r9, 256]
123          PLD     [r9, 320]
124          PLD     [r9, 384]
125
126        .p2align 3
1270:
128        # Load initial bias from w into accumulators
129        VLDM    r9!, {d16-d19}          // Bias
130        VMOV.I32 q2, 0                  // second set of C for pipelining VMLAL
131        VMOV.I32 q3, 0
132
133        .p2align 3
1341:
135        # Load next A pointer
136        LDR     r3, [r2,  0]
137
138        # Add a_offset
139        LDR     r5, [sp, 108]           // a_offset
140        ADD     r2, r2, 4
141        CMP     r3,  r7                 // if a0 == zero
142        ADD     r3,  r3, r5             // a0 += a_offset
143        MOVEQ   r3,  r7                 //   a0 = zero, else a0 += a_offset
144
145        LDR     r5, [sp, 56]            // kc
146        SUBS    r5, r5, 8               // kc - 8
147        BLO     5f                      // less than 8 channels?
148
149        // Prologue - load A0 and B0
150        VLD1.8  {d0},  [r3]!            // A0
151        SUBS    r5, r5, 8               // k = k - 8
152        VLD1.8  {d8},  [r9]!            // B0
153        BLO     3f                      // less than 8 channels?
154
155        // Main loop - 8 bytes
156        // 64 bytes for weights.
157
158        .p2align 3
1592:
160        // Extend
161        ${XXTL} q0, d0
162        $if DATATYPE == "QU8":
163          VSUBL.U8 q4, d8, d14
164        $else:
165          VMOVL.S8 q4, d8
166        $if PREFETCH:
167          PLD     [r9, 448]
168
169        // BLOCK 0
170        VLD1.8  {d10},  [r9]!           // B1
171        VMLAL.S16 q8, d8, d0[0]
172        VMLAL.S16 q9, d9, d0[0]
173        $if DATATYPE == "QU8":
174          VSUBL.U8 q5, d10, d14
175        $else:
176          VMOVL.S8 q5, d10
177
178        // BLOCK 1
179        VLD1.8  {d8},  [r9]!            // B2
180        VMLAL.S16 q2, d10, d0[1]
181        VMLAL.S16 q3, d11, d0[1]
182        $if DATATYPE == "QU8":
183          VSUBL.U8 q4, d8, d14
184        $else:
185          VMOVL.S8 q4, d8
186
187        // BLOCK 2
188        VLD1.8  {d10},  [r9]!           // B3
189        VMLAL.S16 q8, d8, d0[2]
190        VMLAL.S16 q9, d9, d0[2]
191        $if DATATYPE == "QU8":
192          VSUBL.U8 q5, d10, d14
193        $else:
194          VMOVL.S8 q5, d10
195
196        // BLOCK 3
197        VLD1.8  {d8},  [r9]!            // B4
198        VMLAL.S16 q2, d10, d0[3]
199        VMLAL.S16 q3, d11, d0[3]
200        VLD1.8  {d0},  [r3]!            // next A0; d1 still holds widened elements 4-7 of the current A0
201        $if DATATYPE == "QU8":
202          VSUBL.U8 q4, d8, d14
203        $else:
204          VMOVL.S8 q4, d8
205
206        // BLOCK 4
207        VLD1.8  {d10},  [r9]!           // B5
208        VMLAL.S16 q8, d8, d1[0]
209        VMLAL.S16 q9, d9, d1[0]
210        $if DATATYPE == "QU8":
211          VSUBL.U8 q5, d10, d14
212        $else:
213          VMOVL.S8 q5, d10
214
215        // BLOCK 5
216        VLD1.8  {d8},  [r9]!            // B6
217        VMLAL.S16 q2, d10, d1[1]
218        VMLAL.S16 q3, d11, d1[1]
219        $if DATATYPE == "QU8":
220          VSUBL.U8 q4, d8, d14
221        $else:
222          VMOVL.S8 q4, d8
223
224        // BLOCK 6
225        VLD1.8  {d10},  [r9]!           // B7
226        VMLAL.S16 q8, d8, d1[2]
227        VMLAL.S16 q9, d9, d1[2]
228        $if DATATYPE == "QU8":
229          VSUBL.U8 q5, d10, d14
230        $else:
231          VMOVL.S8 q5, d10
232        SUBS    r5, r5, 8               // k -= 8; the BHS at the end of BLOCK 7 tests these flags
233
234        // BLOCK 7
235        VLD1.8  {d8},  [r9]!            // B0
236        VMLAL.S16 q2, d10, d1[3]
237        VMLAL.S16 q3, d11, d1[3]
238        BHS     2b
239
240        // Epilogue
241
242        .p2align 3
2433:
244        // Extend
245        ${XXTL} q0, d0
246        $if DATATYPE == "QU8":
247          VSUBL.U8 q4, d8, d14
248        $else:
249          VMOVL.S8 q4, d8
250        $if PREFETCH:
251          PLD     [r9, 448]
252
253        // BLOCK 0
254        VLD1.8  {d10},  [r9]!           // B1
255        VMLAL.S16 q8, d8, d0[0]
256        VMLAL.S16 q9, d9, d0[0]
257        $if DATATYPE == "QU8":
258          VSUBL.U8 q5, d10, d14
259        $else:
260          VMOVL.S8 q5, d10
261
262        // BLOCK 1
263        VLD1.8  {d8},  [r9]!            // B2
264        VMLAL.S16 q2, d10, d0[1]
265        VMLAL.S16 q3, d11, d0[1]
266        $if DATATYPE == "QU8":
267          VSUBL.U8 q4, d8, d14
268        $else:
269          VMOVL.S8 q4, d8
270
271        // BLOCK 2
272        VLD1.8  {d10},  [r9]!           // B3
273        VMLAL.S16 q8, d8, d0[2]
274        VMLAL.S16 q9, d9, d0[2]
275        $if DATATYPE == "QU8":
276          VSUBL.U8 q5, d10, d14
277        $else:
278          VMOVL.S8 q5, d10
279
280        // BLOCK 3
281        VLD1.8  {d8},  [r9]!            // B4
282        VMLAL.S16 q2, d10, d0[3]
283        VMLAL.S16 q3, d11, d0[3]
284        $if DATATYPE == "QU8":
285          VSUBL.U8 q4, d8, d14
286        $else:
287          VMOVL.S8 q4, d8
288
289        // BLOCK 4
290        VLD1.8  {d10},  [r9]!           // B5
291        VMLAL.S16 q8, d8, d1[0]
292        VMLAL.S16 q9, d9, d1[0]
293        $if DATATYPE == "QU8":
294          VSUBL.U8 q5, d10, d14
295        $else:
296          VMOVL.S8 q5, d10
297
298        // BLOCK 5
299        VLD1.8  {d8},  [r9]!            // B6
300        VMLAL.S16 q2, d10, d1[1]
301        VMLAL.S16 q3, d11, d1[1]
302        $if DATATYPE == "QU8":
303          VSUBL.U8 q4, d8, d14
304        $else:
305          VMOVL.S8 q4, d8
306
307        // BLOCK 6
308        VLD1.8  {d10},  [r9]!           // B7
309        VMLAL.S16 q8, d8, d1[2]
310        VMLAL.S16 q9, d9, d1[2]
311        $if DATATYPE == "QU8":
312          VSUBL.U8 q5, d10, d14
313        $else:
314          VMOVL.S8 q5, d10
315        ADDS    r5, r5, 8               // r5 = leftover kc bytes (0-7); the BNE below tests these flags
316
317        VMLAL.S16 q2, d10, d1[3]
318        VMLAL.S16 q3, d11, d1[3]
319
320        # Is there a remainder?- 1-7 bytes of A
321        BNE     6f
322
3234:
324        # ks loop
325        SUBS    r14, r14, 4             // ks -= MR * sizeof(void*)
326        BHI     1b
327
328        LDR     r14, [sp, 60]           // p = ks
329
330        VADD.S32 q8, q8, q2             // fold the second accumulator set into q8/q9
331        VADD.S32 q9, q9, q3
332
333        $if REQUANTIZATION == "RNDNU":
334          # RNDNU quantization
335          VDUP.32 q0, d12[0]              // right_pre_shift
336
337          VQSHL.S32 q8,  q8, q0
338          VQSHL.S32 q9,  q9, q0
339
340          VDUP.32 q2, d13[0]              // right_post_shift
341
342          VQDMULH.S32 q8,  q8, d12[1]     // multiplier
343          VQDMULH.S32 q9,  q9, d12[1]
344
345          VRSHL.S32 q8,  q8, q2
346          VRSHL.S32 q9,  q9, q2
347        $elif DATATYPE == "QC8" and ARMV8:
348          # QC8 FP32 quantization
349          VLD1.8  {q0-q1},  [r9]!
350
351          VCVT.F32.S32 q8,  q8
352          VCVT.F32.S32 q9,  q9
353
354          VMUL.F32 q8,  q8, q0            // multiplier
355          VMUL.F32 q9,  q9, q1
356
357          VCVTN.S32.F32 q8,  q8
358          VCVTN.S32.F32 q9,  q9
359        $elif DATATYPE == "QC8" and not ARMV8:
360          # QC8 FP32 quantization
361          VLD1.8  {q0-q1},  [r9]!
362
363          VDUP.32 q2, d12[0]              // magic_bias
364          VDUP.32 q3, d12[1]              // magic_bias_less_output_zero_point
365
366          VCVT.F32.S32 q8,  q8
367          VCVT.F32.S32 q9,  q9
368
369          VMUL.F32 q8,  q8, q0            // multiplier
370          VMUL.F32 q9,  q9, q1
371
372          VADD.F32 q8,  q8, q2            // magic_bias
373          VADD.F32 q9,  q9, q2
374
375          VQSUB.S32 q8,  q8, q3           // magic_bias_less_output_zero_point
376          VQSUB.S32 q9,  q9, q3
377
378        $if DATATYPE != "QC8" or ARMV8:
379          VDUP.16 q0, d13[2]              // output_zero_point
380
381        VQMOVN.S32 d16, q8
382        VQMOVN.S32 d17, q9
383
384        $if DATATYPE != "QC8" or ARMV8:
385          VQADD.S16 q8,  q8, q0
386
387        VDUP.8  d24, d13[6]             // output_min
388
389        ${SQXTXN} d0,  q8
390
391        VDUP.8  d25, d13[7]             // output_max
392
393        ${XMAX} d0, d0, d24
394
395        SUBS    r1, r1, 8               // nc -= 8; the BLO and BHI below both test these flags
396
397        ${XMIN} d0, d0, d25
398
399        # Store full 1 x 8
400        BLO     7f
401        VST1.8  {d0}, [r11], r12        // store 8 outputs, then c += cn_stride
402        SUB     r2, r2, r14             // a -= ks
403        BHI     0b
404
405        $if DATATYPE == "QU8":
406          VPOP    {d8-d14}
407          ADD     sp, sp, 8               // skip r2, r3
408        $else:
409          VPOP    {d8-d13}
410          ADD     sp, sp, 16              // skip pad of 8, r2, r3
411        POP     {r5, r6, r7, r9, r11, pc}
412
413        # Remainder- 1 to 7 bytes of A
414        .p2align 3
4155:
416        AND     r5, r5, 7               // kc remainder 1 to 7
4176:
418        VLD1.8  {d0},  [r3]             // reads 8 bytes of A even though fewer remain
419        VLD1.8  {d8},  [r9]!
420
421        ${XXTL} q0, d0
422        $if DATATYPE == "QU8":
423          VSUBL.U8 q4, d8, d14
424        $else:
425          VMOVL.S8 q4, d8
426        VMLAL.S16 q8, d8, d0[0]
427        VMLAL.S16 q9, d9, d0[0]
428        CMP     r5, 2
429        BLO     4b
430
431        VLD1.8  {d8},  [r9]!
432        $if DATATYPE == "QU8":
433          VSUBL.U8 q4, d8, d14
434        $else:
435          VMOVL.S8 q4, d8
436        VMLAL.S16 q8, d8, d0[1]
437        VMLAL.S16 q9, d9, d0[1]
438        BEQ     4b                      // flags still from CMP r5, 2
439
440        VLD1.8  {d8},  [r9]!
441        $if DATATYPE == "QU8":
442          VSUBL.U8 q4, d8, d14
443        $else:
444          VMOVL.S8 q4, d8
445        VMLAL.S16 q8, d8, d0[2]
446        VMLAL.S16 q9, d9, d0[2]
447        CMP     r5, 4
448        BLO     4b
449
450        VLD1.8  {d8},  [r9]!
451        $if DATATYPE == "QU8":
452          VSUBL.U8 q4, d8, d14
453        $else:
454          VMOVL.S8 q4, d8
455        VMLAL.S16 q8, d8, d0[3]
456        VMLAL.S16 q9, d9, d0[3]
457        BEQ     4b                      // flags still from CMP r5, 4
458
459        VLD1.8  {d8},  [r9]!
460        $if DATATYPE == "QU8":
461          VSUBL.U8 q4, d8, d14
462        $else:
463          VMOVL.S8 q4, d8
464        VMLAL.S16 q8, d8, d1[0]
465        VMLAL.S16 q9, d9, d1[0]
466        CMP     r5, 6
467        BLO     4b
468
469        VLD1.8  {d8},  [r9]!
470        $if DATATYPE == "QU8":
471          VSUBL.U8 q4, d8, d14
472        $else:
473          VMOVL.S8 q4, d8
474        VMLAL.S16 q8, d8, d1[1]
475        VMLAL.S16 q9, d9, d1[1]
476        BEQ     4b                      // flags still from CMP r5, 6
477
478        VLD1.8  {d8},  [r9]!
479        $if DATATYPE == "QU8":
480          VSUBL.U8 q4, d8, d14
481        $else:
482          VMOVL.S8 q4, d8
483        VMLAL.S16 q8, d8, d1[2]
484        VMLAL.S16 q9, d9, d1[2]
485        B       4b
486
487        # Store odd width
488        .p2align 3
4897:
490        TST     r1, 4                   // r1 = nc - 8 here, so its low 3 bits equal those of nc
491        BEQ     8f
492        VST1.32 {d0[0]}, [r11]!
493        VEXT.8  q0, q0, q0, 4           // rotate so the next unstored bytes start at d0[0]
4948:
495        TST     r1, 2
496        BEQ     9f
497        VST1.16 {d0[0]}, [r11]!
498        VEXT.8  q0, q0, q0, 2           // rotate so the next unstored byte is at d0[0]
499
5009:
501        TST     r1, 1
502        BEQ     10f
503        VST1.8  {d0[0]}, [r11]
504
50510:
506        $if DATATYPE == "QU8":
507          VPOP    {d8-d14}
508          ADD     sp, sp, 8               // skip r2, r3
509        $else:
510          VPOP    {d8-d13}
511          ADD     sp, sp, 16              // skip pad of 8, r2, r3
512        POP     {r5, r6, r7, r9, r11, pc}
513
514END_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}
515
// On ELF targets, emit an empty .note.GNU-stack section (by GNU toolchain
// convention this marks the object as not requiring an executable stack).
516#ifdef __ELF__
517.section ".note.GNU-stack","",%progbits
518#endif
519