• Home
  • Raw
  • Download

Lines Matching +full:d3 +full:- +full:time +full:- +full:format

7 //     http://www.apache.org/licenses/LICENSE-2.0
37 Format; typedef
50 // http://stackoverflow.com/questions/3898435/labels-in-gcc-inline-assembly in Run()
62 // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0). in Run()
63 // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7 in Run()
64 // (q1--q3). in Run()
65 // A 12x4 block of accumulators is stored in 32bit in q4--q15. in Run()
67 // +-----+-----+-----+-----+ in Run()
69 // Rhs +-----+-----+-----+-----+ in Run()
71 // +-----+-----+-----+-----+ in Run()
77 // +--+--+ - - - - +-----+-----+-----+-----+ in Run()
78 // |d2|d3| | q4 | q5 | q6 | q7 | in Run()
79 // |d2|d3| | q4 | q5 | q6 | q7 | in Run()
80 // |d2|d3| | q4 | q5 | q6 | q7 | in Run()
81 // |d2|d3| | q4 | q5 | q6 | q7 | in Run()
82 // +--+--+ - - - - +-----+-----+-----+-----+ in Run()
87 // +--+--+ - - - - +-----+-----+-----+-----+ in Run()
92 // +--+--+ - - - - +-----+-----+-----+-----+ in Run()
168 // harmful on A53 --- It looks as if A53 doesn't like in Run()
175 // Multiply-accumulate, level of depth 0 in Run()
193 // Multiply-accumulate, level of depth 1 in Run()
194 "vmlal.u16 q4, d3, d1[0]\n" in Run()
195 "vmlal.u16 q5, d3, d1[1]\n" in Run()
197 "vmlal.u16 q6, d3, d1[2]\n" in Run()
198 "vmlal.u16 q7, d3, d1[3]\n" in Run()
223 // Multiply-accumulate, level of depth 0 in Run()
237 // Multiply-accumulate, level of depth 1 in Run()
238 "vmlal.u16 q4, d3, d1[0]\n" in Run()
239 "vmlal.u16 q5, d3, d1[1]\n" in Run()
240 "vmlal.u16 q6, d3, d1[2]\n" in Run()
241 "vmlal.u16 q7, d3, d1[3]\n" in Run()
283 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", in Run()
298 Format; typedef
301 return "NEON, 12x4, depth 2, assuming 12-bit products"; in Name()
310 "optimized kernel (NEON 12x4, assuming 12-bit products)"); in Run()
323 // This kernel is special in that it uses local 16-bit accumulators. in Run()
325 // 16 products into a local 16-bit accumulator without risking overflow. in Run()
326 // At that point, it must accumulate these local 16-bit accumulators back in Run()
327 // into global 32-bit accumulators, which have to be stored in memory for in Run()
330 // stored in diagonal-major order like this for the first 4x4 cell: in Run()
337 // and likewise for the 2nd cell (16--31) and 3rd cell (32--47) in Run()
365 // Load global accumulators from destination matrix, column-major in Run()
372 "vld1.32 {d2,d3}, [r1]!\n" in Run()
382 // 4x4-block-wise diagonal-major order. What we effectively want to do in Run()
384 // column-major order in registers. So we achieve this by in Run()
392 "vswp d3, d6\n" in Run()
415 "vst4.32 {d1,d3,d5,d7}, [r0]!\n" in Run()
428 // Registers q4--q15 are the local 16-bit accumulators. in Run()
430 // by *two* local 16-bit accumulators: one for even levels in Run()
432 // to the scalars at even and odd indices within each q-register. in Run()
435 // is the same as was described above for the global 32-bit in Run()
436 // accumulators (3 cells of size 4x4 in diagonal-major order) in Run()
441 // A 12x2 block of 3 4x2 cells Lhs is stored in 8bit in d1--d3. in Run()
443 // +--------+--------+--------+--------+ in Run()
445 // Rhs +--------+--------+--------+--------+ in Run()
447 // +--------+--------+--------+--------+ in Run()
453 // +-----+-----+ - - - +--------+--------+--------+--------+ in Run()
458 // +-----+-----+ - - - +--------+--------+--------+--------+ in Run()
463 // +-----+-----+ - - - +--------+--------+--------+--------+ in Run()
464 // |d3[0]|d3[1]| |q12[0,1]|q12[0,1]|q12[0,1]|q12[0,1]| in Run()
465 // |d3[2]|d3[3]| |q13[2,3]|q13[2,3]|q13[2,3]|q13[2,3]| in Run()
466 // |d3[4]|d3[5]| |q14[4,5]|q14[4,5]|q14[4,5]|q14[4,5]| in Run()
467 // |d3[6]|d3[7]| |q15[6,7]|q15[6,7]|q15[6,7]|q15[6,7]| in Run()
468 // +-----+-----+ - - - +--------+--------+--------+--------+ in Run()
470 // Local 16-bit accumulators in Run()
475 "vld1.8 {d1,d2,d3}, [%[lhs_ptr]:64]!\n" \ in Run()
480 /* Multiply-accumulate */ \ in Run()
483 "vmlal.u8 q12, d3, d0\n" \ in Run()
487 "vmlal.u8 q13, d3, d0\n" \ in Run()
491 "vmlal.u8 q14, d3, d0\n" \ in Run()
495 "vmlal.u8 q15, d3, d0\n" \ in Run()
505 // Clear local 16-bit accumulators in Run()
546 // This is about summing adjacent pairs of 16-bit scalars into in Run()
547 // single 32-bit scalars, so we use pairwise long addition (vpadal). in Run()
550 "vld1.32 {d0,d1,d2,d3}, [r0]!\n" in Run()
556 "vst1.32 {d0,d1,d2,d3}, [r1]!\n" in Run()
558 "vld1.32 {d0,d1,d2,d3}, [r0]!\n" in Run()
564 "vst1.32 {d0,d1,d2,d3}, [r1]!\n" in Run()
566 "vld1.32 {d0,d1,d2,d3}, [r0]!\n" in Run()
572 "vst1.32 {d0,d1,d2,d3}, [r1]!\n" in Run()
588 // (column-major) in Run()
592 // between column-major and diagonal-major orders. in Run()
600 "vld4.32 {d1,d3,d5,d7}, [r0]!\n" in Run()
619 "vswp d3, d6\n" in Run()
628 // Store into the column-major destination matrix in Run()
635 "vst1.32 {d2,d3}, [r1]!\n" in Run()
655 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", in Run()
673 Format; typedef
695 // A 2x16 block of Rhs is stored in 8 bit in d0--d3. in Run()
696 // A 4x16 block of Lhs is stored in 8 bit in d4--d7. That is only in Run()
698 // twice. Only half of it, a 2x16 block, is stored in d4--d7 at in Run()
699 // any given time. in Run()
701 // A 4x2 block of accumulators is stored in q8--q15 (as 4x32 bit in Run()
702 // components which need to be horizontally-added at the end) in Run()
708 // their range being [ -2^7 , 2^7 ), their products are in range in Run()
709 // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values in Run()
719 // temporary-16bit-accumulators, we have them cycle through q4--q7. in Run()
722 // Register layout (ignoring the q4--q7 temporary 16bit accumulators): in Run()
724 // +----+----+ in Run()
729 // Rhs +----+----+ in Run()
730 // | d1 | d3 | in Run()
734 // +----+----+ in Run()
740 // +--------+--------+ - - - - +----+----+ in Run()
745 // +--------+--------+ - - - - +----+----+ in Run()
766 "vldr d3, [%[rhs_ptr], #24]\n" in Run()
784 // Multiply-accumulate second-half, again into the same in Run()
789 "vmlal.s8 q5, d3, d5\n" in Run()
792 "vmlal.s8 q7, d3, d7\n" in Run()
795 // Add pairwise, accumulate into 32-bit accumulators. in Run()
815 // Multiply-accumulate second-half, again into the same in Run()
821 "vmlal.s8 q5, d3, d5\n" in Run()
825 "vmlal.s8 q7, d3, d7\n" in Run()
826 "vldr d3, [%[rhs_ptr], #24]\n" in Run()
828 // Add pairwise, accumulate into 32-bit accumulators. in Run()
846 // Multiply-accumulate second-half, again into the same in Run()
851 "vmlal.s8 q5, d3, d5\n" in Run()
853 "vmlal.s8 q7, d3, d7\n" in Run()
855 // Add pairwise, accumulate into 32-bit accumulators. in Run()
866 "vpadd.s32 d3, d22, d23\n" in Run()
876 // (each pass adds pairwise. we need to add 4-wise). in Run()
879 "vpadd.s32 d10, d1, d3\n" in Run()
888 // (each pass adds pairwise. we need to add 4-wise), in Run()
895 "vpadd.s32 d10, d1, d3\n" in Run()
898 // Add horizontally-reduced accumulators into in Run()
916 "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", in Run()
927 // Same as NEON_32bit_GEMM_Int8Operands_LhsNonzero, but uses a side format that
928 // requires that user inputs were originally int8. This avoids the uint8->int8
935 Format; typedef
947 Format; typedef
1008 // Multiply-accumulate second-half, again into the same in Run()
1033 // A 4x16 block of Rhs is stored in 8 bit in v0--v3. in Run()
1034 // A 4x16 block of Lhs is stored in 8 bit in v4--v7. in Run()
1036 // A 4x4 block of accumulators is stored in v16-v31 (as 4x32 bit in Run()
1037 // components which need to be horizontally-added at the end) in Run()
1043 // their range being [ -2^7 , 2^7 ), their products are in range in Run()
1044 // [ -2^14 , 2^14 - 1 ), meaning that we can add two such values in Run()
1054 // temporary-16bit-accumulators, we have them cycle through v8--v15. in Run()
1057 // Register layout (ignoring the v8--v15 temporary 16bit accumulators): in Run()
1059 // +--------+--------+--------+--------+ in Run()
1061 // Rhs +--------+--------+--------+--------+ in Run()
1063 // +--------+--------+--------+--------+ in Run()
1065 // +--------+--------+--------+--------+ in Run()
1071 // +-------+-----+--------+ - - +--------+--------+--------+--------+ in Run()
1076 // +-------+--------------+ - - +--------+--------+--------+--------+ in Run()
1081 // Some multiplications and 16-bit accumulation were already done above, in Run()
1102 // Multiply-accumulate second-half, again into the same in Run()
1140 // Multiply-accumulate second-half, again into the same in Run()
1166 // Some multiplications and 16-bit accumulation were already done above, in Run()
1185 // Multiply-accumulate second-half, again into the same in Run()
1222 // (each pass adds pairwise. we need to add 4-wise). in Run()
1234 // (each pass adds pairwise. we need to add 4-wise), in Run()
1246 // Add horizontally-reduced accumulators into in Run()
1279 // Same as NEON_32bit_GEMM_Int8Operands_LhsNonzero, but uses a side format that
1280 // requires that user inputs were originally int8. This avoids the uint8->int8
1287 Format; typedef
1294 Format; typedef
1415 // A 2x8 block of 2 2x4 cells of Rhs is stored in 16bit in v0--v1. in Run()
1416 // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in v2--v4. in Run()
1417 // A 12x8 block of accumulators is stored in 32bit in v8--v31. in Run()
1419 // +--------+--------+-----+--------+--------+ in Run()
1421 // Rhs +--------+--------+-----+--------+--------+ in Run()
1423 // +--------+--------+-----+--------+--------+ in Run()
1429 // +-------+-------+ - - +--------+--------+-----+--------+--------+ in Run()
1434 // +-------+-------+ - - +--------+--------+-----+--------+--------+ in Run()
1439 // +-------+-------+ - - +--------+--------+-----+--------+--------+ in Run()
1444 // +-------+-------+ - - +--------+--------+-----+--------+--------+ in Run()
1457 // Multiply-accumulate, top third in Run()
1476 // Multiply-accumulate, middle third in Run()
1497 // Multiply-accumulate, bottom third in Run()
1528 // Multiply-accumulate, level of depth 0 in Run()
1554 // Multiply-accumulate, level of depth 1 in Run()
1643 #error This kernel requires ARM dot-product instructions. Enable them by \
1644 adding '+dotprod' to a compiler flag, e.g. -march=armv8.2-a+dotprod . \
1651 // The dot product instructions work by taking 4 consecutive 8-bit depth
1653 // accumulating all the results into the corresponding 32-bit accumulator
1654 // lane. As such, the operation is identical to a 32-bit instruction (like
1655 // FMLA used in SGEMM), except that 4 depth values are processed at a time
1660 // performance for most processors) below with the opcode (fmla -> udot) and
1661 // types (float32 -> uint8/uint32) changed.
1664 // with "sdot" - performance should be identical to this udot kernel.
1668 Format; typedef
1779 // Start the MACs at the head of the loop - 1st cell from each side in Run()
1792 "ld1 {v2.16b}, [%[lhs_ptr]], #16\n" // Done with first Lhs cell - load in Run()
1806 "ld1 {v0.16b}, [%[rhs_ptr]], #16\n" // Done with the first Rhs cell - in Run()