// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert MR % 4 == 0
$assert NR % 4 == 0
$ABC = "0123456789ABCDEFGHIJKLMN"
#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/ppmm.h>


void xnn_f32_ppmm_ukernel_${MR}x${NR}__sse(
    size_t mr,
    size_t nc,
    size_t kc,
    const float*restrict a,
    const float*restrict w,
    float*restrict c,
    size_t cm_stride,
    size_t cn_stride,
    const union xnn_f32_output_params params[restrict static 1])
{
  assert(mr != 0);
  assert(mr <= ${MR});
  assert(nc != 0);
  assert(kc != 0);
  assert(kc % sizeof(float) == 0);

  float* c0 = c;
  $for M in range(1, MR):
    float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride);
    $if M % 2 == 0:
      if XNN_UNPREDICTABLE(mr <= ${M}) {
        c${M} = c${M-1};
      }
    $elif M + 1 == MR:
      if XNN_UNPREDICTABLE(mr != ${M+1}) {
        c${M} = c${M-1};
      }
    $else:
      if XNN_UNPREDICTABLE(mr < ${M+1}) {
        c${M} = c${M-1};
      }

  do {
    __m128 vacc0x${ABC[0:4]} = _mm_load_ps(w);
    $for N in range(4, NR, 4):
      __m128 vacc0x${ABC[N:N+4]} = _mm_load_ps(w + ${N});
    $for M in range(1, MR):
      $for N in range(0, NR, 4):
        __m128 vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};
    w += ${NR};

    size_t k = kc;
    do {
      const __m128 va${ABC[0:4]} = _mm_load_ps(a);
      $for M in range(4, MR, 4):
        const __m128 va${ABC[M:M+4]} = _mm_load_ps(a + ${M});
      a += ${MR};

      const __m128 vb${ABC[0:4]} = _mm_load_ps(w);
      $for N in range(4, NR, 4):
        const __m128 vb${ABC[N:N+4]} = _mm_load_ps(w + ${N});
      w += ${NR};

      $for M in range(MR):
        $MMMM = str(M) * 4
        const __m128 va${MMMM} = _mm_shuffle_ps(va${ABC[M&-4:4+M&-4]}, va${ABC[M&-4:4+M&-4]}, _MM_SHUFFLE(${M % 4}, ${M % 4}, ${M % 4}, ${M % 4}));

      $for N in range(0, NR, 4):
        $for M in range(MR):
          $MMMM = str(M) * 4
          vacc${M}x${ABC[N:N+4]} = _mm_add_ps(vacc${M}x${ABC[N:N+4]}, _mm_mul_ps(va${MMMM}, vb${ABC[N:N+4]}));

      k -= sizeof(float);
    } while (k != 0);

    const __m128 vmax = _mm_load_ps(params->sse.max);
    $for N in range(0, NR, 4):
      $for M in range(MR):
        vacc${M}x${ABC[N:N+4]} = _mm_min_ps(vacc${M}x${ABC[N:N+4]}, vmax);

    const __m128 vmin = _mm_load_ps(params->sse.min);
    $for N in range(0, NR, 4):
      $for M in range(MR):
        vacc${M}x${ABC[N:N+4]} = _mm_max_ps(vacc${M}x${ABC[N:N+4]}, vmin);

    if XNN_LIKELY(nc >= ${NR}) {
      $for M in reversed(range(MR)):
        _mm_storeu_ps(c${M}, vacc${M}x${ABC[0:4]});
        $for N in range(4, NR, 4):
          _mm_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+4]});

      a = (const float*) ((uintptr_t) a - kc * ${MR});

      $for M in reversed(range(MR)):
        c${M} = (float*) ((uintptr_t) c${M} + cn_stride);

      nc -= ${NR};
    } else {
      $for LOG2N in reversed(range(NR.bit_length())):
        $if NR != 1 << LOG2N:
          if (nc & ${1 << LOG2N}) {
            $if LOG2N >= 2:
              $for M in reversed(range(MR)):
                _mm_storeu_ps(c${M}, vacc${M}x${ABC[0:4]});
                $for N in range(4, 1 << LOG2N, 4):
                  _mm_storeu_ps(c${M} + ${N}, vacc${M}x${ABC[N:N+4]});

              $for M in reversed(range(MR)):
                $for N in range(0, 1 << (LOG2N - 1), 4):
                  vacc${M}x${ABC[N:N+4]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+4]};

              $for M in reversed(range(MR)):
                c${M} += ${1 << LOG2N};
            $elif LOG2N == 1:
              $for M in reversed(range(MR)):
                _mm_storel_pi((__m64*) c${M}, vacc${M}x${ABC[0:4]});

              $for M in reversed(range(MR)):
                vacc${M}x${ABC[0:4]} = _mm_movehl_ps(vacc${M}x${ABC[0:4]}, vacc${M}x${ABC[0:4]});

              $for M in reversed(range(MR)):
                c${M} += 2;
            $elif LOG2N == 0:
              $for M in reversed(range(MR)):
                _mm_store_ss(c${M}, vacc${M}x${ABC[0:4]});
          }

      nc = 0;
    }
  } while (nc != 0);
}
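For readers unfamiliar with the packed layout this template assumes, the sketch below is a plain scalar rendering of what one generated ${MR}x${NR} tile computes, derived from the kernel above: `a` carries `kc/sizeof(float)` groups of MR pre-packed inputs (one per output row), `w` carries NR initial accumulators followed by `kc/sizeof(float)` groups of NR packed weights, and each result is clamped to [min, max] before being written to `c` with `cm_stride` bytes between rows. The function name and its explicit `mr_tile`/`nr_tile` parameters are illustrative only, not part of XNNPACK, and the partial-tile store paths of the real kernel are omitted.

#include <stddef.h>
#include <stdint.h>

// Illustrative scalar reference for one full MRxNR tile (remainder handling omitted).
// kc is in bytes, exactly as in the generated kernel.
static void ppmm_tile_reference(
    size_t mr_tile, size_t nr_tile, size_t kc,
    const float* a, const float* w, float* c, size_t cm_stride,
    float min, float max)
{
  const size_t k_elements = kc / sizeof(float);
  for (size_t m = 0; m < mr_tile; m++) {
    float* c_row = (float*) ((uintptr_t) c + m * cm_stride);
    for (size_t n = 0; n < nr_tile; n++) {
      // The initial accumulator comes from the first NR entries of the packed weights.
      float acc = w[n];
      for (size_t k = 0; k < k_elements; k++) {
        // a is packed as groups of MR values per k; w continues with groups of NR values per k.
        acc += a[k * mr_tile + m] * w[nr_tile + k * nr_tile + n];
      }
      // Clamp in the same order as the kernel: upper bound first, then lower bound.
      acc = acc > max ? max : acc;
      acc = acc < min ? min : acc;
      c_row[n] = acc;
    }
  }
}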