• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <sys/mman.h>
18 #include <unistd.h>
19 
20 #include "rsCpuIntrinsic.h"
21 #include "rsCpuIntrinsicInlines.h"
22 #include "linkloader/include/MemChunk.h"
23 
24 #include <sys/mman.h>
25 #include <stddef.h>
26 #include <stdint.h>
27 #include <stdlib.h>
28 //#include <utils/StopWatch.h>
29 
30 
31 /*  uint kernel
32  *  Q0  D0:  Load slot for R
33  *      D1:  Load slot for G
34  *  Q1  D2:  Load slot for B
35  *      D3:  Load slot for A
36  *  Q2  D4:  Matrix
37  *      D5:  =
38  *  Q3  D6:  =
39  *      D7:  =
40  *  Q4  D8:  Add R
41  *      D9:
42  *  Q5  D10: Add G
43  *      D11:
44  *  Q6  D12: Add B
45  *      D13:
46  *  Q7  D14: Add A
47  *      D15:
48  *  Q8  D16:  I32: R Sum
49  *      D17:
50  *  Q9  D18:  I32: G Sum
51  *      D19:
52  *  Q10 D20:  I32: B Sum
53  *      D21:
54  *  Q11 D22:  I32: A Sum
55  *      D23:
56  *  Q12 D24:  U16: expanded R
57  *      D25:
58  *  Q13 D26:  U16: expanded G
59  *      D27:
60  *  Q14 D28:  U16: expanded B
61  *      D29:
62  *  Q15 D30:  U16: expanded A
63  *      D31:
64  *
65  */
66 
67 /*  float kernel
68  *  Q0  D0:  Load slot for R
69  *      D1:  =
70  *  Q1  D2:  Load slot for G
71  *      D3:  =
72  *  Q2  D4:  Load slot for B
73  *      D5:  =
74  *  Q3  D6:  Load slot for A
75  *      D7:  =
76  *  Q4  D8:  Matrix
77  *      D9:  =
78  *  Q5  D10: =
79  *      D11: =
80  *  Q6  D12: =
81  *      D13: =
82  *  Q7  D14: =
83  *      D15: =
84  *  Q8  D16: Add R
85  *      D17: =
86  *  Q9  D18: Add G
87  *      D19: =
88  *  Q10 D20: Add B
89  *      D21: =
90  *  Q11 D22: Add A
91  *      D23: =
92  *  Q12 D24: Sum R
93  *      D25: =
94  *  Q13 D26: Sum G
95  *      D27: =
96  *  Q14 D28: Sum B
97  *      D29: =
98  *  Q15 D30: Sum A
99  *      D31: =
100  *
101  */
102 
103 
104 
105 using namespace android;
106 using namespace android::renderscript;
107 
108 namespace android {
109 namespace renderscript {
110 
/*
 * Identifies one color-matrix configuration.  The individual bit-fields are
 * filled in through `u`; two configurations are compared as a whole through
 * `key`.
 */
typedef union {
    uint64_t key;                        // all fields below viewed as one integer
    struct {
        uint32_t inVecSize          :2;  // [0 - 1]  input vector size minus one
        uint32_t outVecSize         :2;  // [2 - 3]  output vector size minus one
        uint32_t inType             :4;  // [4 - 7]  RS_TYPE_FLOAT_32 for float input, otherwise 0
        uint32_t outType            :4;  // [8 - 11] RS_TYPE_FLOAT_32 for float output, otherwise 0
        uint32_t dot                :1;  // [12]     r, g and b outputs share the same coefficients
        uint32_t _unused1           :1;  // [13]
        uint32_t copyAlpha          :1;  // [14]     alpha passes through unchanged
        uint32_t _unused2           :1;  // [15]
        uint32_t coeffMask          :16; // [16-31]  bit i set when matrix coefficient i is needed
        uint32_t addMask            :4;  // [32-35]  bit c set when channel c has an add term
    } u;
} Key_t;
126 
127 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
128 public:
129     virtual void populateScript(Script *);
130 
131     virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
132 
133     virtual ~RsdCpuScriptIntrinsicColorMatrix();
134     RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
135 
136     virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
137                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);
138     virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
139                             const void * usr, uint32_t usrLen, const RsScriptCall *sc);
140 
141 protected:
142     float fp[16];
143     float fpa[4];
144 
145     // The following four fields are read as constants
146     // by the SIMD assembly code.
147     short ip[16];
148     int ipa[16];
149     float tmpFp[16];
150     float tmpFpa[16];
151 
152     static void kernel(const RsForEachStubParamStruct *p,
153                        uint32_t xstart, uint32_t xend,
154                        uint32_t instep, uint32_t outstep);
155     void updateCoeffCache(float fpMul, float addMul);
156 
157     Key_t mLastKey;
158     unsigned char *mBuf;
159     size_t mBufSize;
160 
161     Key_t computeKey(const Element *ein, const Element *eout);
162 
163     bool build(Key_t key);
164 
165     void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
166 
167 };
168 
169 }
170 }
171 
172 
computeKey(const Element * ein,const Element * eout)173 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
174         const Element *ein, const Element *eout) {
175 
176     Key_t key;
177     key.key = 0;
178 
179     // Compute a unique code key for this operation
180 
181     // Add to the key the input and output types
182     bool hasFloat = false;
183     if (ein->getType() == RS_TYPE_FLOAT_32) {
184         hasFloat = true;
185         key.u.inType = RS_TYPE_FLOAT_32;
186         rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
187     }
188     if (eout->getType() == RS_TYPE_FLOAT_32) {
189         hasFloat = true;
190         key.u.outType = RS_TYPE_FLOAT_32;
191         rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
192     }
193 
194     // Mask in the bits indicating which coefficients in the
195     // color matrix are needed.
196     if (hasFloat) {
197         for (uint32_t i=0; i < 16; i++) {
198             if (fabs(fp[i]) != 0.f) {
199                 key.u.coeffMask |= 1 << i;
200             }
201         }
202         if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
203         if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
204         if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
205         if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
206 
207     } else {
208         for (uint32_t i=0; i < 16; i++) {
209             if (ip[i] != 0) {
210                 key.u.coeffMask |= 1 << i;
211             }
212         }
213         if (ipa[0] != 0) key.u.addMask |= 0x1;
214         if (ipa[4] != 0) key.u.addMask |= 0x2;
215         if (ipa[8] != 0) key.u.addMask |= 0x4;
216         if (ipa[12] != 0) key.u.addMask |= 0x8;
217     }
218 
219     // Look for a dot product where the r,g,b colums are the same
220     if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
221         (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
222         (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
223         (ip[12] == ip[13]) && (ip[12] == ip[14])) {
224 
225         if (!key.u.addMask) key.u.dot = 1;
226     }
227 
228     // Is alpha a simple copy
229     if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
230         key.u.copyAlpha = !(key.u.inType || key.u.outType);
231     }
232 
233     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
234 
235     switch (ein->getVectorSize()) {
236     case 4:
237         key.u.inVecSize = 3;
238         break;
239     case 3:
240         key.u.inVecSize = 2;
241         key.u.coeffMask &= ~0xF000;
242         break;
243     case 2:
244         key.u.inVecSize = 1;
245         key.u.coeffMask &= ~0xFF00;
246         break;
247     default:
248         key.u.coeffMask &= ~0xFFF0;
249         break;
250     }
251 
252     switch (eout->getVectorSize()) {
253     case 4:
254         key.u.outVecSize = 3;
255         break;
256     case 3:
257         key.u.outVecSize = 2;
258         key.u.coeffMask &= ~0x8888;
259         break;
260     case 2:
261         key.u.outVecSize = 1;
262         key.u.coeffMask &= ~0xCCCC;
263         break;
264     default:
265         key.u.coeffMask &= ~0xEEEE;
266         break;
267     }
268 
269     if (key.u.inType && !key.u.outType) {
270         key.u.addMask |= 1;
271         if (key.u.outVecSize > 0) key.u.addMask |= 2;
272         if (key.u.outVecSize > 1) key.u.addMask |= 4;
273         if (key.u.outVecSize > 2) key.u.addMask |= 8;
274     }
275 
276     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
277     return key;
278 }
279 
280 #if defined(ARCH_ARM_HAVE_NEON)
281 
// Declares the three external symbols that describe one code fragment
// "name": its first word, its end marker, and a word holding its length in
// bytes (the length is what ADD_CHUNK copies).
#define DEF_SYM(name)                                   \
    extern "C" uint32_t _N_ColorMatrix_##name;          \
    extern "C" uint32_t _N_ColorMatrix_##name##_end;    \
    extern "C" uint32_t _N_ColorMatrix_##name##_len;

// Function prologue (integer / float variants) and the two epilogue parts.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads, by input format and vector size.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores, by output format and vector size.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Unpack / pack steps of the integer path and the special fragments.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
329 
// Appends code fragment "x" at the local cursor `buf` and advances the
// cursor past it.  Wrapped in do/while(0) so the macro is a single statement
// and stays correct inside an unbraced if/else.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
333 
334 
335 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
336     size_t off = (target - buf - 8) >> 2;
337     rsAssert(((off & 0xff000000) == 0) ||
338            ((off & 0xff000000) == 0xff000000));
339 
340     uint32_t op = (condition << 28);
341     op |= 0xa << 24;  // branch
342     op |= 0xffffff & off;
343     ((uint32_t *)buf)[0] = op;
344     return buf + 4;
345 }
346 
encodeSIMDRegs(uint32_t vd,uint32_t vn,uint32_t vm)347 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
348     rsAssert(vd < 32);
349     rsAssert(vm < 32);
350     rsAssert(vn < 32);
351 
352     uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
353     op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
354     op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
355     return op;
356 }
357 
addVMLAL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)358 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
359     //vmlal.s16 Q#1, D#1, D#2[#]
360     uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
361     ((uint32_t *)buf)[0] = op;
362     return buf + 4;
363 }
364 
addVMULL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)365 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
366     //vmull.s16 Q#1, D#1, D#2[#]
367     uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
368     ((uint32_t *)buf)[0] = op;
369     return buf + 4;
370 }
371 
addVQADD_S32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)372 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
373     //vqadd.s32 Q#1, D#1, D#2
374     uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
375     ((uint32_t *)buf)[0] = op;
376     return buf + 4;
377 }
378 
addVMLAL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)379 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
380     //vmlal.f32 Q#1, D#1, D#2[#]
381     uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
382     ((uint32_t *)buf)[0] = op;
383     return buf + 4;
384 }
385 
addVMULL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)386 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
387     //vmull.f32 Q#1, D#1, D#2[#]
388     uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
389     ((uint32_t *)buf)[0] = op;
390     return buf + 4;
391 }
392 
addVORR_32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)393 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
394     //vadd.f32 Q#1, D#1, D#2
395     uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
396     ((uint32_t *)buf)[0] = op;
397     return buf + 4;
398 }
399 
addVADD_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)400 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
401     //vadd.f32 Q#1, D#1, D#2
402     uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
403     ((uint32_t *)buf)[0] = op;
404     return buf + 4;
405 }
406 #endif
407 
408 
build(Key_t key)409 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
410 #if defined(ARCH_ARM_HAVE_NEON)
411     mBufSize = 4096;
412     //StopWatch build_time("rs cm: build time");
413     mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
414                                   MAP_PRIVATE | MAP_ANON, -1, 0);
415     if (!mBuf) {
416         return false;
417     }
418 
419     uint8_t *buf = mBuf;
420     uint8_t *buf2 = NULL;
421 
422     int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
423     int opInit[4] = {0, 0, 0, 0};
424 
425     memset(ops, 0, sizeof(ops));
426     for (int i=0; i < 4; i++) {
427         if (key.u.coeffMask & (1 << (i*4))) {
428             ops[i][0] = 0x2 | opInit[0];
429             opInit[0] = 1;
430         }
431         if (!key.u.dot) {
432             if (key.u.coeffMask & (1 << (1 + i*4))) {
433                 ops[i][1] = 0x2 | opInit[1];
434                 opInit[1] = 1;
435             }
436             if (key.u.coeffMask & (1 << (2 + i*4))) {
437                 ops[i][2] = 0x2 | opInit[2];
438                 opInit[2] = 1;
439             }
440         }
441         if (!key.u.copyAlpha) {
442             if (key.u.coeffMask & (1 << (3 + i*4))) {
443                 ops[i][3] = 0x2 | opInit[3];
444                 opInit[3] = 1;
445             }
446         }
447     }
448 
449     if (key.u.inType || key.u.outType) {
450         key.u.copyAlpha = 0;
451         ADD_CHUNK(prefix_f);
452         buf2 = buf;
453 
454         // Load the incoming r,g,b,a as needed
455         if (key.u.inType) {
456             switch(key.u.inVecSize) {
457             case 3:
458                 ADD_CHUNK(load_f32_4);
459                 break;
460             case 2:
461                 ADD_CHUNK(load_f32_3);
462                 break;
463             case 1:
464                 ADD_CHUNK(load_f32_2);
465                 break;
466             case 0:
467                 ADD_CHUNK(load_f32_1);
468                 break;
469             }
470         } else {
471             switch(key.u.inVecSize) {
472             case 3:
473                 ADD_CHUNK(load_u8f_4);
474                 break;
475             case 2:
476                 ADD_CHUNK(load_u8f_3);
477                 break;
478             case 1:
479                 ADD_CHUNK(load_u8f_2);
480                 break;
481             case 0:
482                 ADD_CHUNK(load_u8f_1);
483                 break;
484             }
485         }
486 
487         for (int i=0; i < 4; i++) {
488             for (int j=0; j < 4; j++) {
489                 switch(ops[i][j]) {
490                 case 0:
491                     break;
492                 case 2:
493                     buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
494                     break;
495                 case 3:
496                     buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
497                     break;
498                 }
499             }
500         }
501         for (int j=0; j < 4; j++) {
502             if (opInit[j]) {
503                 if (key.u.addMask & (1 << j)) {
504                     buf = addVADD_F32(buf, j, 12+j, 8+j);
505                 } else {
506                     buf = addVORR_32(buf, j, 12+j, 12+j);
507                 }
508             } else {
509                 if (key.u.addMask & (1 << j)) {
510                     buf = addVADD_F32(buf, j, j, 8+j);
511                 }
512             }
513         }
514 
515         if (key.u.outType) {
516             switch(key.u.outVecSize) {
517             case 3:
518                 ADD_CHUNK(store_f32_4);
519                 break;
520             case 2:
521                 ADD_CHUNK(store_f32_3);
522                 break;
523             case 1:
524                 ADD_CHUNK(store_f32_2);
525                 break;
526             case 0:
527                 ADD_CHUNK(store_f32_1);
528                 break;
529             }
530         } else {
531             switch(key.u.outVecSize) {
532             case 3:
533             case 2:
534                 ADD_CHUNK(store_f32u_4);
535                 break;
536             case 1:
537                 ADD_CHUNK(store_f32u_2);
538                 break;
539             case 0:
540                 ADD_CHUNK(store_f32u_1);
541                 break;
542             }
543         }
544 
545 
546     } else {
547         // Add the function prefix
548         // Store the address for the loop return
549         ADD_CHUNK(prefix_i);
550         buf2 = buf;
551 
552         // Load the incoming r,g,b,a as needed
553         switch(key.u.inVecSize) {
554         case 3:
555             ADD_CHUNK(load_u8_4);
556             if (key.u.copyAlpha) {
557                 ADD_CHUNK(unpack_u8_3);
558             } else {
559                 ADD_CHUNK(unpack_u8_4);
560             }
561             break;
562         case 2:
563             ADD_CHUNK(load_u8_3);
564             ADD_CHUNK(unpack_u8_3);
565             break;
566         case 1:
567             ADD_CHUNK(load_u8_2);
568             ADD_CHUNK(unpack_u8_2);
569             break;
570         case 0:
571             ADD_CHUNK(load_u8_1);
572             ADD_CHUNK(unpack_u8_1);
573             break;
574         }
575 
576         // Add multiply and accumulate
577         // use MULL to init the output register,
578         // use MLAL from there
579         for (int i=0; i < 4; i++) {
580             for (int j=0; j < 4; j++) {
581                 switch(ops[i][j]) {
582                 case 0:
583                     break;
584                 case 2:
585                     buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
586                     break;
587                 case 3:
588                     buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
589                     break;
590                 }
591             }
592         }
593         for (int j=0; j < 4; j++) {
594             if (opInit[j]) {
595                 if (key.u.addMask & (1 << j)) {
596                     buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
597                 }
598             } else {
599                 if (key.u.addMask & (1 << j)) {
600                     buf = addVQADD_S32(buf, 8+j, 12+j, 4+j);
601                 }
602             }
603         }
604 
605         // If we have a dot product, perform the special pack.
606         if (key.u.dot) {
607             ADD_CHUNK(pack_u8_1);
608             ADD_CHUNK(dot);
609         } else {
610             switch(key.u.outVecSize) {
611             case 3:
612                 if (key.u.copyAlpha) {
613                     ADD_CHUNK(pack_u8_3);
614                 } else {
615                     ADD_CHUNK(pack_u8_4);
616                 }
617                 break;
618             case 2:
619                 ADD_CHUNK(pack_u8_3);
620                 break;
621             case 1:
622                 ADD_CHUNK(pack_u8_2);
623                 break;
624             case 0:
625                 ADD_CHUNK(pack_u8_1);
626                 break;
627             }
628         }
629 
630         // Write out result
631         switch(key.u.outVecSize) {
632         case 3:
633         case 2:
634             ADD_CHUNK(store_u8_4);
635             break;
636         case 1:
637             ADD_CHUNK(store_u8_2);
638             break;
639         case 0:
640             ADD_CHUNK(store_u8_1);
641             break;
642         }
643     }
644 
645     if (key.u.inType != key.u.outType) {
646         key.u.copyAlpha = 0;
647         key.u.dot = 0;
648     }
649 
650     // Loop, branch, and cleanup
651     ADD_CHUNK(postfix1);
652     buf = addBranch(buf, buf2, 0x01);
653     ADD_CHUNK(postfix2);
654 
655     int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
656     if (ret == -1) {
657         ALOGE("mprotect error %i", ret);
658         return false;
659     }
660 
661     cacheflush((long)mBuf, (long)mBuf + mBufSize, 0);
662     return true;
663 #else
664     return false;
665 #endif
666 }
667 
updateCoeffCache(float fpMul,float addMul)668 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
669     for(int ct=0; ct < 16; ct++) {
670         ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
671         tmpFp[ct] = fp[ct] * fpMul;
672         //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
673     }
674 
675     float add = 0.f;
676     if (fpMul > 254.f) add = 0.5f;
677     for(int ct=0; ct < 4; ct++) {
678         tmpFpa[ct * 4 + 0] = fpa[ct] * addMul + add;
679         //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
680         tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4];
681         tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4];
682         tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4];
683     }
684 
685     for(int ct=0; ct < 4; ct++) {
686         ipa[ct * 4 + 0] = (int)(fpa[ct] * 65536.f + 0.5f);
687         ipa[ct * 4 + 1] = ipa[ct * 4];
688         ipa[ct * 4 + 2] = ipa[ct * 4];
689         ipa[ct * 4 + 3] = ipa[ct * 4];
690     }
691 }
692 
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)693 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
694                                                     size_t dataLength) {
695     switch(slot) {
696     case 0:
697         memcpy (fp, data, sizeof(fp));
698         break;
699     case 1:
700         memcpy (fpa, data, sizeof(fpa));
701         break;
702     default:
703         rsAssert(0);
704         break;
705     }
706     mRootPtr = &kernel;
707 }
708 
709 
One(const RsForEachStubParamStruct * p,void * out,const void * py,const float * coeff,const float * add,uint32_t vsin,uint32_t vsout,bool fin,bool fout)710 static void One(const RsForEachStubParamStruct *p, void *out,
711                 const void *py, const float* coeff, const float *add,
712                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
713 
714     float4 f = 0.f;
715     if (fin) {
716         switch(vsin) {
717         case 3:
718             f = ((const float4 *)py)[0];
719             break;
720         case 2:
721             f = ((const float4 *)py)[0];
722             f.w = 0.f;
723             break;
724         case 1:
725             f.xy = ((const float2 *)py)[0];
726             break;
727         case 0:
728             f.x = ((const float *)py)[0];
729             break;
730         }
731     } else {
732         switch(vsin) {
733         case 3:
734             f = convert_float4(((const uchar4 *)py)[0]);
735             break;
736         case 2:
737             f = convert_float4(((const uchar4 *)py)[0]);
738             f.w = 0.f;
739             break;
740         case 1:
741             f.xy = convert_float2(((const uchar2 *)py)[0]);
742             break;
743         case 0:
744             f.x = (float)(((const uchar *)py)[0]);
745             break;
746         }
747     }
748     //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
749 
750     float4 sum;
751     sum.x = f.x * coeff[0] +
752             f.y * coeff[4] +
753             f.z * coeff[8] +
754             f.w * coeff[12];
755     sum.y = f.x * coeff[1] +
756             f.y * coeff[5] +
757             f.z * coeff[9] +
758             f.w * coeff[13];
759     sum.z = f.x * coeff[2] +
760             f.y * coeff[6] +
761             f.z * coeff[10] +
762             f.w * coeff[14];
763     sum.w = f.x * coeff[3] +
764             f.y * coeff[7] +
765             f.z * coeff[11] +
766             f.w * coeff[15];
767     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
768 
769     sum.x += add[0];
770     sum.y += add[4];
771     sum.z += add[8];
772     sum.w += add[12];
773 
774 
775     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
776     if (fout) {
777         switch(vsout) {
778         case 3:
779         case 2:
780             ((float4 *)out)[0] = sum;
781             break;
782         case 1:
783             ((float2 *)out)[0] = sum.xy;
784             break;
785         case 0:
786             ((float *)out)[0] = sum.x;
787             break;
788         }
789     } else {
790         sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
791         sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
792         sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
793         sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
794 
795         switch(vsout) {
796         case 3:
797         case 2:
798             ((uchar4 *)out)[0] = convert_uchar4(sum);
799             break;
800         case 1:
801             ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
802             break;
803         case 0:
804             ((uchar *)out)[0] = sum.x;
805             break;
806         }
807     }
808     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
809 }
810 
kernel(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)811 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
812                                               uint32_t xstart, uint32_t xend,
813                                               uint32_t instep, uint32_t outstep) {
814     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
815     uchar *out = (uchar *)p->out;
816     uchar *in = (uchar *)p->in;
817     uint32_t x1 = xstart;
818     uint32_t x2 = xend;
819 
820     uint32_t vsin = cp->mLastKey.u.inVecSize;
821     uint32_t vsout = cp->mLastKey.u.outVecSize;
822     bool floatIn = !!cp->mLastKey.u.inType;
823     bool floatOut = !!cp->mLastKey.u.outType;
824 
825     //if (!p->y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
826 
827     if(x2 > x1) {
828         int32_t len = (x2 - x1) >> 2;
829         if((cp->mOptKernel != NULL) && (len > 0)) {
830             cp->mOptKernel(out, in, cp->ip, len);
831             x1 += len << 2;
832             out += outstep * (len << 2);
833             in += instep * (len << 2);
834         }
835 
836         while(x1 != x2) {
837             One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
838             out += outstep;
839             in += instep;
840             x1++;
841         }
842     }
843 }
844 
preLaunch(uint32_t slot,const Allocation * ain,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)845 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
846         uint32_t slot, const Allocation * ain, Allocation * aout,
847         const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
848 
849     const Element *ein = ain->mHal.state.type->getElement();
850     const Element *eout = aout->mHal.state.type->getElement();
851 
852     if (ein->getType() == eout->getType()) {
853         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
854             updateCoeffCache(1.f, 255.f);
855         } else {
856             updateCoeffCache(1.f, 1.f);
857         }
858     } else {
859         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
860             updateCoeffCache(255.f, 255.f);
861         } else {
862             updateCoeffCache(1.f / 255.f, 1.f);
863         }
864     }
865 
866     Key_t key = computeKey(ain->mHal.state.type->getElement(),
867                            aout->mHal.state.type->getElement());
868     if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
869         if (mBuf) munmap(mBuf, mBufSize);
870         mBuf = NULL;
871         mOptKernel = NULL;
872         if (build(key)) {
873             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
874             mLastKey = key;
875         }
876     }
877 }
878 
postLaunch(uint32_t slot,const Allocation * ain,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)879 void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
880         uint32_t slot, const Allocation * ain, Allocation * aout,
881         const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
882 
883 }
884 
RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)885 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
886             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
887             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
888 
889     mLastKey.key = 0;
890     mBuf = NULL;
891     mBufSize = 0;
892     mOptKernel = NULL;
893     const static float defaultMatrix[] = {
894         1.f, 0.f, 0.f, 0.f,
895         0.f, 1.f, 0.f, 0.f,
896         0.f, 0.f, 1.f, 0.f,
897         0.f, 0.f, 0.f, 1.f
898     };
899     const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
900     setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
901     setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
902 }
903 
~RsdCpuScriptIntrinsicColorMatrix()904 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
905     if (mBuf) munmap(mBuf, mBufSize);
906     mBuf = NULL;
907     mOptKernel = NULL;
908 }
909 
populateScript(Script * s)910 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
911     s->mHal.info.exportedVariableCount = 2;
912 }
913 
rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)914 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
915                                             const Script *s, const Element *e) {
916 
917     return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
918 }
919 
920 
921 
922