kernel_neon.h - OpenGrok cross reference for /external/gemmlowp/internal/kernel

Lines Matching +full:d3 +full:- +full:time +full:- +full:format
7 //     http://www.apache.org/licenses/LICENSE-2.0
37       Format;  typedef
50 // http://stackoverflow.com/questions/3898435/labels-in-gcc-inline-assembly  in Run()
62         // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0).  in Run()
63         // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7  in Run()
64         // (q1--q3).  in Run()
65         // A 12x4 block of accumulators is stored in 32bit in q4--q15.  in Run()
67         //                   +-----+-----+-----+-----+  in Run()
69         //              Rhs  +-----+-----+-----+-----+  in Run()
71         //                   +-----+-----+-----+-----+  in Run()
77         //  +--+--+ - - - -  +-----+-----+-----+-----+  in Run()
78         //  |d2|d3|          | q4  | q5  | q6  | q7  |  in Run()
79         //  |d2|d3|          | q4  | q5  | q6  | q7  |  in Run()
80         //  |d2|d3|          | q4  | q5  | q6  | q7  |  in Run()
81         //  |d2|d3|          | q4  | q5  | q6  | q7  |  in Run()
82         //  +--+--+ - - - -  +-----+-----+-----+-----+  in Run()
87         //  +--+--+ - - - -  +-----+-----+-----+-----+  in Run()
92         //  +--+--+ - - - -  +-----+-----+-----+-----+  in Run()
168         // harmful on A53 --- It looks as if A53 doesn't like  in Run()
175         // Multiply-accumulate, level of depth 0  in Run()
193         // Multiply-accumulate, level of depth 1  in Run()
194         "vmlal.u16 q4, d3, d1[0]\n"  in Run()
195         "vmlal.u16 q5, d3, d1[1]\n"  in Run()
197         "vmlal.u16 q6, d3, d1[2]\n"  in Run()
198         "vmlal.u16 q7, d3, d1[3]\n"  in Run()
223         // Multiply-accumulate, level of depth 0  in Run()
237         // Multiply-accumulate, level of depth 1  in Run()
238         "vmlal.u16 q4, d3, d1[0]\n"  in Run()
239         "vmlal.u16 q5, d3, d1[1]\n"  in Run()
240         "vmlal.u16 q6, d3, d1[2]\n"  in Run()
241         "vmlal.u16 q7, d3, d1[3]\n"  in Run()
283         "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",  in Run()
298       Format;  typedef
301     return "NEON, 12x4, depth 2, assuming 12-bit products";  in Name()
310         "optimized kernel (NEON 12x4, assuming 12-bit products)");  in Run()
323     // This kernel is special in that it uses local 16-bit accumulators.  in Run()
325     // 16 products into a local 16-bit accumulator without risking overflow.  in Run()
326     // At that point, it must accumulate these local 16-bit accumulators back  in Run()
327     // into global 32-bit accumulators, which have to be stored in memory for  in Run()
330     // stored in diagonal-major order like this for the first 4x4 cell:  in Run()
337     // and likewise for the 2nd  cell (16--31) and 3rd cell (32--47)  in Run()
365         // Load global accumulators from destination matrix, column-major  in Run()
372         "vld1.32 {d2,d3}, [r1]!\n"  in Run()
382         // 4x4-block-wise diagonal-major order. What we effectively want to do  in Run()
384         // column-major order in registers. So we achieve this by  in Run()
392         "vswp d3, d6\n"  in Run()
415         "vst4.32 {d1,d3,d5,d7}, [r0]!\n"  in Run()
428     // Registers q4--q15 are the local 16-bit accumulators.  in Run()
430     // by *two* local 16-bit accumulators: one for even levels  in Run()
432     // to the scalars at even and odd indices within each q-register.  in Run()
435     // is the same as was described above for the global 32-bit  in Run()
436     // accumulators (3 cells of size 4x4 in diagonal-major order)  in Run()
441     // A 12x2 block of 3 4x2 cells Lhs is stored in 8bit in d1--d3.  in Run()
443     //                      +--------+--------+--------+--------+  in Run()
445     //                 Rhs  +--------+--------+--------+--------+  in Run()
447     //                      +--------+--------+--------+--------+  in Run()
453     //  +-----+-----+ - - - +--------+--------+--------+--------+  in Run()
458     //  +-----+-----+ - - - +--------+--------+--------+--------+  in Run()
463     //  +-----+-----+ - - - +--------+--------+--------+--------+  in Run()
464     //  |d3[0]|d3[1]|       |q12[0,1]|q12[0,1]|q12[0,1]|q12[0,1]|  in Run()
465     //  |d3[2]|d3[3]|       |q13[2,3]|q13[2,3]|q13[2,3]|q13[2,3]|  in Run()
466     //  |d3[4]|d3[5]|       |q14[4,5]|q14[4,5]|q14[4,5]|q14[4,5]|  in Run()
467     //  |d3[6]|d3[7]|       |q15[6,7]|q15[6,7]|q15[6,7]|q15[6,7]|  in Run()
468     //  +-----+-----+ - - - +--------+--------+--------+--------+  in Run()
470     //                            Local 16-bit accumulators  in Run()
475   "vld1.8 {d1,d2,d3}, [%[lhs_ptr]:64]!\n"     \  in Run()
480   /* Multiply-accumulate */                   \  in Run()
483   "vmlal.u8 q12, d3, d0\n"                    \  in Run()
487   "vmlal.u8 q13, d3, d0\n"                    \  in Run()
491   "vmlal.u8 q14, d3, d0\n"                    \  in Run()
495   "vmlal.u8 q15, d3, d0\n"                    \  in Run()
505         // Clear local 16-bit accumulators  in Run()
546         // This is about summing adjacent pairs of 16-bit scalars into  in Run()
547         // single 32-bit scalars, so we use pairwise long addition (vpadal).  in Run()
550         "vld1.32 {d0,d1,d2,d3}, [r0]!\n"  in Run()
556         "vst1.32 {d0,d1,d2,d3}, [r1]!\n"  in Run()
558         "vld1.32 {d0,d1,d2,d3}, [r0]!\n"  in Run()
564         "vst1.32 {d0,d1,d2,d3}, [r1]!\n"  in Run()
566         "vld1.32 {d0,d1,d2,d3}, [r0]!\n"  in Run()
572         "vst1.32 {d0,d1,d2,d3}, [r1]!\n"  in Run()
588         // (column-major)  in Run()
592         // between column-major and diagonal-major orders.  in Run()
600         "vld4.32 {d1,d3,d5,d7}, [r0]!\n"  in Run()
619         "vswp d3, d6\n"  in Run()
628         // Store into the column-major destination matrix  in Run()
635         "vst1.32 {d2,d3}, [r1]!\n"  in Run()
655         "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",  in Run()
673       Format;  typedef
695         // A 2x16 block of Rhs is stored in 8 bit in d0--d3.  in Run()
696         // A 4x16 block of Lhs is stored in 8 bit in d4--d7. That is only  in Run()
698         // twice. Only half of it, a 2x16 block, is stored in d4--d7 at  in Run()
699         // any given time.  in Run()
701         // A 4x2 block of accumulators is stored in q8--q15 (as 4x32 bit  in Run()
702         // components which need to be horizontally-added at the end)  in Run()
708         // their range being [ -2^7 , 2^7 ), their products are in range  in Run()
709         // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values  in Run()
719         // temporary-16bit-accumulators, we have them cycle through q4--q7.  in Run()
722         // Register layout (ignoring the q4--q7 temporary 16bit accumulators):  in Run()
724         //                               +----+----+  in Run()
729         //                       Rhs     +----+----+  in Run()
730         //                               | d1 | d3 |  in Run()
734         //                               +----+----+  in Run()
740         //  +--------+--------+ - - - -  +----+----+  in Run()
745         //  +--------+--------+ - - - -  +----+----+  in Run()
766         "vldr d3, [%[rhs_ptr], #24]\n"  in Run()
784         // Multiply-accumulate second-half, again into the same  in Run()
789         "vmlal.s8    q5,  d3,  d5\n"  in Run()
792         "vmlal.s8    q7,  d3,  d7\n"  in Run()
795         // Add pairwise, accumulate into 32-bit accumulators.  in Run()
815         // Multiply-accumulate second-half, again into the same  in Run()
821         "vmlal.s8    q5,  d3,  d5\n"  in Run()
825         "vmlal.s8    q7,  d3,  d7\n"  in Run()
826         "vldr d3, [%[rhs_ptr], #24]\n"  in Run()
828         // Add pairwise, accumulate into 32-bit accumulators.  in Run()
846         // Multiply-accumulate second-half, again into the same  in Run()
851         "vmlal.s8    q5,  d3,  d5\n"  in Run()
853         "vmlal.s8    q7,  d3,  d7\n"  in Run()
855         // Add pairwise, accumulate into 32-bit accumulators.  in Run()
866         "vpadd.s32 d3, d22, d23\n"  in Run()
876         // (each pass adds pairwise. we need to add 4-wise).  in Run()
879         "vpadd.s32 d10, d1, d3\n"  in Run()
888         // (each pass adds pairwise. we need to add 4-wise),  in Run()
895         "vpadd.s32 d10, d1, d3\n"  in Run()
898         // Add horizontally-reduced accumulators into  in Run()
916         "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",  in Run()
927 // Same as NEON_32bit_GEMM_Int8Operands_LhsNonzero, but uses a side format that
928 // requires that user inputs were originally int8. This avoids the uint8->int8
935       Format;  typedef
947       Format;  typedef
1008         // Multiply-accumulate second-half, again into the same  in Run()
1033         // A 4x16 block of Rhs is stored in 8 bit in v0--v3.  in Run()
1034         // A 4x16 block of Lhs is stored in 8 bit in v4--v7.  in Run()
1036         // A 4x4 block of accumulators is stored in v16-v31 (as 4x32 bit  in Run()
1037         // components which need to be horizontally-added at the end)  in Run()
1043         // their range being [ -2^7 , 2^7 ), their products are in range  in Run()
1044         // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values  in Run()
1054         // temporary-16bit-accumulators, we have them cycle through v8--v15.  in Run()
1057         // Register layout (ignoring the v8--v15 temporary 16bit accumulators):  in Run()
1059         //                               +--------+--------+--------+--------+  in Run()
1061         //                          Rhs  +--------+--------+--------+--------+  in Run()
1063         //                               +--------+--------+--------+--------+  in Run()
1065         //                               +--------+--------+--------+--------+  in Run()
1071         //  +-------+-----+--------+ - - +--------+--------+--------+--------+  in Run()
1076         //  +-------+--------------+ - - +--------+--------+--------+--------+  in Run()
1081         // Some multiplications and 16-bit accumulation were already done above,  in Run()
1102         // Multiply-accumulate second-half, again into the same  in Run()
1140         // Multiply-accumulate second-half, again into the same  in Run()
1166         // Some multiplications and 16-bit accumulation were already done above,  in Run()
1185         // Multiply-accumulate second-half, again into the same  in Run()
1222         // (each pass adds pairwise. we need to add 4-wise).  in Run()
1234         // (each pass adds pairwise. we need to add 4-wise),  in Run()
1246         // Add horizontally-reduced accumulators into  in Run()
1279 // Same as NEON_32bit_GEMM_Int8Operands_LhsNonzero, but uses a side format that
1280 // requires that user inputs were originally int8. This avoids the uint8->int8
1287       Format;  typedef
1294       Format;  typedef
1415         // A 2x8 block of 2 2x4 cells of Rhs is stored in 16bit in v0--v1.  in Run()
1416         // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in v2--v4.  in Run()
1417         // A 12x8 block of accumulators is stored in 32bit in v8--v31.  in Run()
1419         //                         +--------+--------+-----+--------+--------+  in Run()
1421         //                    Rhs  +--------+--------+-----+--------+--------+  in Run()
1423         //                         +--------+--------+-----+--------+--------+  in Run()
1429         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+  in Run()
1434         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+  in Run()
1439         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+  in Run()
1444         //  +-------+-------+ - -  +--------+--------+-----+--------+--------+  in Run()
1457         // Multiply-accumulate, top third  in Run()
1476         // Multiply-accumulate, middle third  in Run()
1497         // Multiply-accumulate, bottom third  in Run()
1528         // Multiply-accumulate, level of depth 0  in Run()
1554         // Multiply-accumulate, level of depth 1  in Run()
1643 #error This kernel requires ARM dot-product instructions. Enable them by \
1644   adding '+dotprod' to a compiler flag, e.g. -march=armv8.2-a+dotprod . \
1651 // The dot product instructions work by taking 4 consecutive 8-bit depth
1653 // accumulating all the results into the corresponding 32-bit accumulator
1654 // lane.  As such, the operation is identical to a 32-bit instruction (like
1655 // FMLA used in SGEMM), except that 4 depth values are processed at a time
1660 // performance for most processors) below with the opcode (fmla -> udot) and
1661 // types (float32 -> uint8/uint32) changed.
1664 // with "sdot" - performance should be identical to this udot kernel.
1668       Format;  typedef
1779         // Start the MACs at the head of the loop - 1st cell from each side  in Run()
1792         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"  // Done with first Lhs cell - load  in Run()
1806         "ld1 {v0.16b}, [%[rhs_ptr]], #16\n"  // Done with the first Rhs cell -  in Run()