1 /* 16-bit signed integer dot product
2 * Altivec-assisted version
3 * Copyright 2004 Phil Karn
4 * May be used under the terms of the GNU Lesser General Public License (LGPL)
5 */
#include <stdint.h>
#include <stdlib.h>
#include "fec.h"
8
/* Descriptor built by initdp_av(), used by dotprod_av() and released by
 * freedp_av().
 */
struct dotprod {
  /* Number of coefficients */
  int len;

  /* Eight copies of the coefficient set. Copy k (k = 0..7) is stored
   * starting k 16-bit elements into its buffer, so that one of the copies
   * lines up with the input data whatever the input's offset within a
   * 16-byte block happens to be.
   */
  signed short *coeffs[8];
};
17
18 /* Create and return a descriptor for use with the dot product function */
initdp_av(signed short coeffs[],int len)19 void *initdp_av(signed short coeffs[],int len){
20 struct dotprod *dp;
21 int i,j;
22
23 if(len == 0)
24 return NULL;
25
26 dp = (struct dotprod *)calloc(1,sizeof(struct dotprod));
27 dp->len = len;
28
29 /* Make 8 copies of coefficients, one for each data alignment,
30 * each aligned to 16-byte boundary
31 */
32 for(i=0;i<8;i++){
33 dp->coeffs[i] = calloc(1+(len+i-1)/8,sizeof(vector signed short));
34 for(j=0;j<len;j++)
35 dp->coeffs[i][j+i] = coeffs[j];
36 }
37 return (void *)dp;
38 }
39
40
41 /* Free a dot product descriptor created earlier */
freedp_av(void * p)42 void freedp_av(void *p){
43 struct dotprod *dp = (struct dotprod *)p;
44 int i;
45
46 for(i=0;i<8;i++)
47 if(dp->coeffs[i] != NULL)
48 free(dp->coeffs[i]);
49 free(dp);
50 }
51
52 /* Compute a dot product given a descriptor and an input array
53 * The length is taken from the descriptor
54 */
dotprod_av(void * p,signed short a[])55 long dotprod_av(void *p,signed short a[]){
56 struct dotprod *dp = (struct dotprod *)p;
57 int al;
58 vector signed short *ar,*d;
59 vector signed int sums0,sums1,sums2,sums3;
60 union { vector signed int v; signed int w[4];} s;
61 int nblocks;
62
63 /* round ar down to beginning of 16-byte block containing 0th element of
64 * input buffer. Then set d to one of 8 sets of shifted coefficients
65 */
66 ar = (vector signed short *)((int)a & ~15);
67 al = ((int)a & 15)/sizeof(signed short);
68 d = (vector signed short *)dp->coeffs[al];
69
70 nblocks = (dp->len+al-1)/8+1;
71
72 /* Sum into four vectors each holding four 32-bit partial sums */
73 sums3 = sums2 = sums1 = sums0 = (vector signed int)(0);
74 while(nblocks >= 4){
75 sums0 = vec_msums(ar[nblocks-1],d[nblocks-1],sums0);
76 sums1 = vec_msums(ar[nblocks-2],d[nblocks-2],sums1);
77 sums2 = vec_msums(ar[nblocks-3],d[nblocks-3],sums2);
78 sums3 = vec_msums(ar[nblocks-4],d[nblocks-4],sums3);
79 nblocks -= 4;
80 }
81 sums0 = vec_adds(sums0,sums1);
82 sums2 = vec_adds(sums2,sums3);
83 sums0 = vec_adds(sums0,sums2);
84 while(nblocks-- > 0){
85 sums0 = vec_msums(ar[nblocks],d[nblocks],sums0);
86 }
87 /* Sum 4 partial sums into final result */
88 s.v = vec_sums(sums0,(vector signed int)(0));
89
90 return s.w[3];
91 }
92
93
94