• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2021 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert REQUANTIZATION in ["FP32", "RNDNU"]
7$assert not CHANNELWISE or REQUANTIZATION == "FP32"
8$assert DATATYPE in ["QC8", "QS8", "QU8"]
9$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
10
11#include <xnnpack/assembly.h>
12
13.syntax unified
14
15$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
16$ISA = "neonv8" if ARMV8 else "neon"
17$CPU = "a35" if ARMV8 else "a7"
18$XMIN = "VMIN.U8" if DATATYPE == "QU8" else "VMIN.S8"
19$XMAX = "VMAX.U8" if DATATYPE == "QU8" else "VMAX.S8"
20$XXTL = "VMOVL.U8" if DATATYPE == "QU8" else "VMOVL.S8"
21$SQXTXN = "VQMOVUN.S16" if DATATYPE == "QU8" else "VQMOVN.S16"
22$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
23// void xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}(
24//     size_t mr,                            r0
25//     size_t nc,                            r1
26//     size_t kc,                            (r2) -> r5
27//     const ${XINT8_T}*restrict a,              r3
28//     size_t a_stride,           sp + 96 -> (unused)
29//     const void*restrict w,     sp + 100 -> r9
30//     ${XINT8_T}*restrict c,         sp + 104 -> r11
31//     size_t cm_stride,          sp + 108 -> (unused)
32//     size_t cn_stride,          sp + 112 -> r7
33//     ${PARAMS_UNION} params)  sp + 116 -> (r5)
34
35// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
36
37// Based on cortex_a53 microkernel but with Neon loads
38
39// Register usage
40// A0   r3  d0-d1 q0
41
42// B    r9  d8-d9 q4  d10-d11 q5
43
44// C0  r11 d16-d17  q8  d18-d19  q9
45//         q2, q3 acc2
46
47// Unused r4, r6, r8, r10, r12, d15, q10-q11, q13-q15 (q1 is used only by the QC8 multiplier load; d14 only by QU8)
48
49$if REQUANTIZATION == "RNDNU" and DATATYPE != "QU8":
50  // params structure is 16 bytes
51  //  struct {
52  //    int32_t right_pre_shift;    d12[0]
53  //    int32_t multiplier;         d12[1]
54  //    int32_t right_post_shift;   d13[0]
55  //    int16_t output_zero_point;  d13[2]
56  //    int8_t output_min;          d13[6]
57  //    int8_t output_max;          d13[7]
58  //  } rndnu_neon;
59$elif REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
60  # params structure is 20 bytes
61  #  struct {
62  #    uint8_t kernel_zero_point[4];  d14
63  #    int32_t right_pre_shift;       d12[0]
64  #    int32_t multiplier;            d12[1]
65  #    int32_t right_post_shift;      d13[0]
66  #    int16_t output_zero_point;     d13[2]
67  #    uint8_t output_min;            d13[6]
68  #    uint8_t output_max;            d13[7]
69  #  } rndnu_neon;
70$elif DATATYPE == "QC8" and not ARMV8:
71  // params structure is 10 bytes
72  // struct {
73  //   float magic_bias;                           d12[0]
74  //   int32_t magic_bias_less_output_zero_point;  d12[1]
75  //   int8_t output_min;                          d13[6]
76  //   int8_t output_max;                          d13[7]
77  // } xnn_qs8_minmax_params.neon;
78$else:
79  // params structure is 4 bytes
80  //  struct {
81  //    int16_t output_zero_point;  d13[2]
82  //    int8_t output_min;          d13[6]
83  //    int8_t output_max;          d13[7]
84  //  } xnn_qs8_minmax_params.neonv8;
85
86BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}
        // 1x8 GEMM microkernel. For each block of 8 output columns: seed the
        // 32-bit accumulators q8/q9 with the bias read from w, accumulate
        // widened 16-bit A*B products with VMLAL.S16 (alternating between
        // q8/q9 and a second set q2/q3), requantize, clamp to
        // output_min/output_max and store 8 bytes to c.
        // Between column blocks r3 (a) is rewound by kc and r11 (c) is
        // advanced by cn_stride. A final block of fewer than 8 columns is
        // stored in 4/2/1-byte pieces.
87        # Push 96 bytes
88        PUSH    {r5, r7, r9, r11}                   // 16
89        $if DATATYPE == "QU8":
90          SUB     sp, sp, 24                          // +24
91          VPUSH   {d8-d14}                            // +56 = 96
92        $else:
93          SUB     sp, sp, 32                          // +32
94          VPUSH   {d8-d13}                            // +48 = 96
95
96        LDR     r11, [sp, 104]          // c
97        LDR     r9, [sp, 100]           // w
98        LDR     r5, [sp, 116]           // params
99
100        # Load params values
101        $if DATATYPE == "QU8":
102          VLD1.32 {d14[]}, [r5]!          // QU8 kernel_zero_point
103        $if REQUANTIZATION == "RNDNU":
104          VLDM    r5, {d12-d13}           // RNDNU params
105        $elif DATATYPE == "QC8" and ARMV8:
106          VLD1.32 {d13[]}, [r5]           // QC8 neonv8 params
107        $elif DATATYPE == "QC8" and not ARMV8:
108          VLDM    r5!, {d12}              // QC8 neon params
109          VLD1.16 {d13[]}, [r5]           // output_min/max
110        LDR     r7, [sp, 112]            // cn_stride
111
112        $if PREFETCH:
113          PLD     [r9,  64]               // Prefetch B
114          PLD     [r9, 128]
115          PLD     [r9, 192]
116          PLD     [r9, 256]
117          PLD     [r9, 320]
118          PLD     [r9, 384]
119
120        .p2align 3
1210:
122        # Load initial bias from w into accumulators
123        VLDM    r9!, {d16-d19}          // Bias
124        VMOV.I32 q2, 0                  // second set of C (q2/q3) for pipelining VMLAL
125        SUBS    r5, r2, 8               // k = kc - 8
126        VMOV.I32 q3, 0
127        $if PREFETCH:
128          PLD     [r3,  64]               // Prefetch A
129        BLO     4f                      // kc < 8? only a remainder to process
130
131        // Prologue - load A0 and B0
132        VLD1.8  {d0},  [r3]!            // A0
133        SUBS    r5, r5, 8               // k = k - 8
134        VLD1.8  {d8},  [r9]!            // B0
135        BLO     2f                      // fewer than 8 more channels? skip main loop
136
137        // Main loop - 8 bytes
138        // 64 bytes for weights.
139
140        .p2align 3
1411:
142        // Extend
143        ${XXTL} q0, d0
144        $if DATATYPE == "QU8":
145          VSUBL.U8 q4, d8, d14
146        $else:
147          VMOVL.S8 q4, d8
148        $if PREFETCH:
149          PLD     [r9, 448]
150
151        // BLOCK 0
152        VLD1.8  {d10},  [r9]!           // B1
153        VMLAL.S16 q8, d8, d0[0]
154        VMLAL.S16 q9, d9, d0[0]
155        $if DATATYPE == "QU8":
156          VSUBL.U8 q5, d10, d14
157        $else:
158          VMOVL.S8 q5, d10
159
160        // BLOCK 1
161        VLD1.8  {d8},  [r9]!            // B2
162        VMLAL.S16 q2, d10, d0[1]
163        VMLAL.S16 q3, d11, d0[1]
164        $if DATATYPE == "QU8":
165          VSUBL.U8 q4, d8, d14
166        $else:
167          VMOVL.S8 q4, d8
168
169        // BLOCK 2
170        VLD1.8  {d10},  [r9]!           // B3
171        VMLAL.S16 q8, d8, d0[2]
172        VMLAL.S16 q9, d9, d0[2]
173        $if DATATYPE == "QU8":
174          VSUBL.U8 q5, d10, d14
175        $else:
176          VMOVL.S8 q5, d10
177
178        // BLOCK 3
179        VLD1.8  {d8},  [r9]!            // B4
180        VMLAL.S16 q2, d10, d0[3]
181        VMLAL.S16 q3, d11, d0[3]
182        VLD1.8  {d0},  [r3]!            // A0 for next iteration; d1 still holds widened A[4..7]
183        $if DATATYPE == "QU8":
184          VSUBL.U8 q4, d8, d14
185        $else:
186          VMOVL.S8 q4, d8
187
188        // BLOCK 4
189        VLD1.8  {d10},  [r9]!           // B5
190        VMLAL.S16 q8, d8, d1[0]
191        VMLAL.S16 q9, d9, d1[0]
192        $if DATATYPE == "QU8":
193          VSUBL.U8 q5, d10, d14
194        $else:
195          VMOVL.S8 q5, d10
196
197        // BLOCK 5
198        VLD1.8  {d8},  [r9]!            // B6
199        VMLAL.S16 q2, d10, d1[1]
200        VMLAL.S16 q3, d11, d1[1]
201        $if DATATYPE == "QU8":
202          VSUBL.U8 q4, d8, d14
203        $else:
204          VMOVL.S8 q4, d8
205
206        // BLOCK 6
207        VLD1.8  {d10},  [r9]!           // B7
208        VMLAL.S16 q8, d8, d1[2]
209        VMLAL.S16 q9, d9, d1[2]
210        $if DATATYPE == "QU8":
211          VSUBL.U8 q5, d10, d14
212        $else:
213          VMOVL.S8 q5, d10
214
215        // BLOCK 7
216        VLD1.8  {d8},  [r9]!            // B0 for next iteration / epilogue
217        VMLAL.S16 q2, d10, d1[3]
218        VMLAL.S16 q3, d11, d1[3]
219        SUBS    r5, r5, 8               // k = k - 8
220        BHS     1b                      // loop while k did not go negative
221
222        // Epilogue
223
224        .p2align 3
2252:
226        ${XXTL} q0, d0
227        $if DATATYPE == "QU8":
228          VSUBL.U8 q4, d8, d14
229        $else:
230          VMOVL.S8 q4, d8
231
232        VLD1.8  {d10},  [r9]!           // B1
233        VMLAL.S16 q8, d8, d0[0]
234        VMLAL.S16 q9, d9, d0[0]
235        $if DATATYPE == "QU8":
236          VSUBL.U8 q5, d10, d14
237        $else:
238          VMOVL.S8 q5, d10
239
240        VLD1.8  {d8},  [r9]!            // B2
241        VMLAL.S16 q2, d10, d0[1]
242        VMLAL.S16 q3, d11, d0[1]
243        $if DATATYPE == "QU8":
244          VSUBL.U8 q4, d8, d14
245        $else:
246          VMOVL.S8 q4, d8
247
248        VLD1.8  {d10},  [r9]!           // B3
249        VMLAL.S16 q8, d8, d0[2]
250        VMLAL.S16 q9, d9, d0[2]
251        $if DATATYPE == "QU8":
252          VSUBL.U8 q5, d10, d14
253        $else:
254          VMOVL.S8 q5, d10
255
256        VLD1.8  {d8},  [r9]!            // B4
257        VMLAL.S16 q2, d10, d0[3]
258        VMLAL.S16 q3, d11, d0[3]
259        $if DATATYPE == "QU8":
260          VSUBL.U8 q4, d8, d14
261        $else:
262          VMOVL.S8 q4, d8
263
264        VLD1.8  {d10},  [r9]!           // B5
265        VMLAL.S16 q8, d8, d1[0]
266        VMLAL.S16 q9, d9, d1[0]
267        $if DATATYPE == "QU8":
268          VSUBL.U8 q5, d10, d14
269        $else:
270          VMOVL.S8 q5, d10
271
272        VLD1.8  {d8},  [r9]!            // B6
273        VMLAL.S16 q2, d10, d1[1]
274        VMLAL.S16 q3, d11, d1[1]
275        $if DATATYPE == "QU8":
276          VSUBL.U8 q4, d8, d14
277        $else:
278          VMOVL.S8 q4, d8
279
280        VLD1.8  {d10},  [r9]!           // B7
281        VMLAL.S16 q8, d8, d1[2]
282        VMLAL.S16 q9, d9, d1[2]
283        $if DATATYPE == "QU8":
284          VSUBL.U8 q5, d10, d14
285        $else:
286          VMOVL.S8 q5, d10
287        ADDS    r5, r5, 8               // k += 8: remaining channels (0..7), Z set if none
288
289        VMLAL.S16 q2, d10, d1[3]
290        VMLAL.S16 q3, d11, d1[3]
291
292        # Is there a remainder?- 1-7 bytes of A
293        BNE     4f
294
2953:
296        VADD.S32 q8, q8, q2             // fold second accumulator set into q8/q9
297        VADD.S32 q9, q9, q3
298
299        $if REQUANTIZATION == "RNDNU":
300          # RNDNU quantization
301          VDUP.32 q0, d12[0]              // right_pre_shift
302
303          VQSHL.S32 q8,  q8, q0
304          VQSHL.S32 q9,  q9, q0
305
306          VDUP.32 q2, d13[0]              // right_post_shift
307
308          VQDMULH.S32 q8,  q8, d12[1]     // multiplier
309          VQDMULH.S32 q9,  q9, d12[1]
310
311          VRSHL.S32 q8,  q8, q2
312          VRSHL.S32 q9,  q9, q2
313        $elif DATATYPE == "QC8" and ARMV8:
314          # QC8 FP32 quantization
315          VLD1.8  {q0-q1},  [r9]!
316
317          VCVT.F32.S32 q8,  q8
318          VCVT.F32.S32 q9,  q9
319
320          VMUL.F32 q8,  q8, q0            // multiplier
321          VMUL.F32 q9,  q9, q1
322
323          VCVTN.S32.F32 q8,  q8
324          VCVTN.S32.F32 q9,  q9
325        $elif DATATYPE == "QC8" and not ARMV8:
326          # QC8 FP32 quantization
327          VLD1.8  {q0-q1},  [r9]!
328
329          VDUP.32 q2, d12[0]              // magic_bias
330          VDUP.32 q3, d12[1]              // magic_bias_less_output_zero_point
331
332          VCVT.F32.S32 q8,  q8
333          VCVT.F32.S32 q9,  q9
334
335          VMUL.F32 q8,  q8, q0            // multiplier
336          VMUL.F32 q9,  q9, q1
337
338          VADD.F32 q8,  q8, q2            // magic_bias
339          VADD.F32 q9,  q9, q2
340
341          VQSUB.S32 q8,  q8, q3           // magic_bias_less_output_zero_point
342          VQSUB.S32 q9,  q9, q3
343
344        $if DATATYPE != "QC8" or ARMV8:
345          VDUP.16 q0, d13[2]              // output_zero_point
346
347        VQMOVN.S32 d16, q8              // saturating narrow 32 -> 16 bits
348        VQMOVN.S32 d17, q9
349
350        $if DATATYPE != "QC8" or ARMV8:
351          VQADD.S16 q8,  q8, q0
352
353        VDUP.8  d24, d13[6]             // output_min
354
355        ${SQXTXN} d0,  q8               // saturating narrow 16 -> 8 bits
356
357        VDUP.8  d25, d13[7]             // output_max
358
359        ${XMAX} d0, d0, d24             // clamp below at output_min
360
361        SUBS    r1, r1, 8               // nc -= 8
362
363        ${XMIN} d0, d0, d25             // clamp above at output_max
364
365        # Store full 1 x 8
366        BLO     5f                      // fewer than 8 columns left: partial store
367        VST1.8  {d0}, [r11], r7         // store 8 outputs, c += cn_stride
368        SUB     r3, r3, r2              // a -= kc
369        BHI     0b                      // more columns (nc > 0)? flags still from SUBS above
370
371        $if DATATYPE == "QU8":
372          VPOP    {d8-d14}
373          ADD     sp, sp, 8               // skip pad of 8
374        $else:
375          VPOP    {d8-d13}
376          ADD     sp, sp, 16              // skip pad of 8 + d14
377        ADD     sp, sp, 16              // release the remaining 16 bytes reserved by the prologue SUB
378        POP     {r5, r7, r9, r11}
379        BX      lr
380
381        # Remainder- 1 to 7 bytes of A
382        .p2align 3
3834:
384        AND     r5, r5, 7               // kc remainder 1 to 7
385
386        VLD1.8  {d0},  [r3], r5         // load 8 bytes of A, advance a by the remainder only
387        VLD1.8  {d8},  [r9]!
388
389        ${XXTL} q0, d0
390        $if DATATYPE == "QU8":
391          VSUBL.U8 q4, d8, d14
392        $else:
393          VMOVL.S8 q4, d8
394        VMLAL.S16 q8, d8, d0[0]
395        VMLAL.S16 q9, d9, d0[0]
396        CMP     r5, 2
397        BLO     3b                      // remainder == 1
398
399        VLD1.8  {d8},  [r9]!
400        $if DATATYPE == "QU8":
401          VSUBL.U8 q4, d8, d14
402        $else:
403          VMOVL.S8 q4, d8
404        VMLAL.S16 q8, d8, d0[1]
405        VMLAL.S16 q9, d9, d0[1]
406        BEQ     3b                      // remainder == 2 (flags from CMP r5, 2)
407
408        VLD1.8  {d8},  [r9]!
409        $if DATATYPE == "QU8":
410          VSUBL.U8 q4, d8, d14
411        $else:
412          VMOVL.S8 q4, d8
413        VMLAL.S16 q8, d8, d0[2]
414        VMLAL.S16 q9, d9, d0[2]
415        CMP     r5, 4
416        BLO     3b                      // remainder == 3
417
418        VLD1.8  {d8},  [r9]!
419        $if DATATYPE == "QU8":
420          VSUBL.U8 q4, d8, d14
421        $else:
422          VMOVL.S8 q4, d8
423        VMLAL.S16 q8, d8, d0[3]
424        VMLAL.S16 q9, d9, d0[3]
425        BEQ     3b                      // remainder == 4 (flags from CMP r5, 4)
426
427        VLD1.8  {d8},  [r9]!
428        $if DATATYPE == "QU8":
429          VSUBL.U8 q4, d8, d14
430        $else:
431          VMOVL.S8 q4, d8
432        VMLAL.S16 q8, d8, d1[0]
433        VMLAL.S16 q9, d9, d1[0]
434        CMP     r5, 6
435        BLO     3b                      // remainder == 5
436
437        VLD1.8  {d8},  [r9]!
438        $if DATATYPE == "QU8":
439          VSUBL.U8 q4, d8, d14
440        $else:
441          VMOVL.S8 q4, d8
442        VMLAL.S16 q8, d8, d1[1]
443        VMLAL.S16 q9, d9, d1[1]
444        BEQ     3b                      // remainder == 6 (flags from CMP r5, 6)
445
446        VLD1.8  {d8},  [r9]!
447        $if DATATYPE == "QU8":
448          VSUBL.U8 q4, d8, d14
449        $else:
450          VMOVL.S8 q4, d8
451        VMLAL.S16 q8, d8, d1[2]
452        VMLAL.S16 q9, d9, d1[2]
453        B       3b                      // remainder == 7
454
455        # Store odd width
456        .p2align 3
4575:
458        TST     r1, 4                   // r1 = nc - 8 here; its low 3 bits equal the columns left
459        BEQ     6f
460        VST1.32 {d0[0]}, [r11]!         // store 4 outputs
461        VEXT.8  q0, q0, q0, 4           // rotate the next outputs down into d0[0]
4626:
463        TST     r1, 2
464        BEQ     7f
465        VST1.16 {d0[0]}, [r11]!         // store 2 outputs
466        VEXT.8  q0, q0, q0, 2           // rotate the next outputs down into d0[0]
4677:
468        TST     r1, 1
469        BEQ     8f
470        VST1.8  {d0[0]}, [r11]          // store the last output
4718:
472        $if DATATYPE == "QU8":
473          VPOP    {d8-d14}
474          ADD     sp, sp, 8               // skip pad of 8
475        $else:
476          VPOP    {d8-d13}
477          ADD     sp, sp, 16              // skip pad of 8 + d14
478        ADD     sp, sp, 16              // release the remaining 16 bytes reserved by the prologue SUB
479        POP     {r5, r7, r9, r11}
480        BX      lr
481
482END_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x8__aarch32_${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}_cortex_${CPU}
483
484#ifdef __ELF__
485.section ".note.GNU-stack","",%progbits
486#endif
487
488