1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <sys/mman.h>
18 #include <unistd.h>
19
20 #include "rsCpuIntrinsic.h"
21 #include "rsCpuIntrinsicInlines.h"
22 #include "linkloader/include/MemChunk.h"
23
24 #include <sys/mman.h>
25 #include <stddef.h>
26 #include <stdint.h>
27 #include <stdlib.h>
28 //#include <utils/StopWatch.h>
29
30
31 /* uint kernel
32 * Q0 D0: Load slot for R
33 * D1: Load slot for G
34 * Q1 D2: Load slot for B
35 * D3: Load slot for A
36 * Q2 D4: Matrix
37 * D5: =
38 * Q3 D6: =
39 * D7: =
40 * Q4 D8: Add R
41 * D9:
42 * Q5 D10: Add G
43 * D11:
44 * Q6 D12: Add B
45 * D13:
46 * Q7 D14: Add A
47 * D15:
48 * Q8 D16: I32: R Sum
49 * D17:
50 * Q9 D18: I32: G Sum
51 * D19:
52 * Q10 D20: I32: B Sum
53 * D21:
54 * Q11 D22: I32: A Sum
55 * D23:
56 * Q12 D24: U16: expanded R
57 * D25:
58 * Q13 D26: U16: expanded G
59 * D27:
60 * Q14 D28: U16: expanded B
61 * D29:
62 * Q15 D30: U16: expanded A
63 * D31:
64 *
65 */
66
67 /* float kernel
68 * Q0 D0: Load slot for R
69 * D1: =
70 * Q1 D2: Load slot for G
71 * D3: =
72 * Q2 D4: Load slot for B
73 * D5: =
74 * Q3 D6: Load slot for A
75 * D7: =
76 * Q4 D8: Matrix
77 * D9: =
78 * Q5 D10: =
79 * D11: =
80 * Q6 D12: =
81 * D13: =
82 * Q7 D14: =
83 * D15: =
84 * Q8 D16: Add R
85 * D17: =
86 * Q9 D18: Add G
87 * D19: =
88 * Q10 D20: Add B
89 * D21: =
90 * Q11 D22: Add A
91 * D23: =
92 * Q12 D24: Sum R
93 * D25: =
94 * Q13 D26: Sum G
95 * D27: =
96 * Q14 D28: Sum B
97 * D29: =
98 * Q15 D30: Sum A
99 * D31: =
100 *
101 */
102
103
104
105 using namespace android;
106 using namespace android::renderscript;
107
108 namespace android {
109 namespace renderscript {
110
// Describes one specialization of the color-matrix kernel.  Every property
// that influences the generated code is packed into the bitfields of `u`, so
// two specializations can be compared with a single integer compare on `key`.
// The bit positions in the comments are the layout the original author
// documented; the actual placement is up to the compiler's bitfield rules.
union Key_t {
    uint64_t key;                   // all fields viewed as one integer
    struct {
        uint32_t inVecSize   :2;    // [0 - 1]  input vector size minus one
        uint32_t outVecSize  :2;    // [2 - 3]  output vector size minus one
        uint32_t inType      :4;    // [4 - 7]  RS_TYPE_FLOAT_32 for float input, else 0
        uint32_t outType     :4;    // [8 - 11] RS_TYPE_FLOAT_32 for float output, else 0
        uint32_t dot         :1;    // [12]     r, g and b columns are identical
        uint32_t _unused1    :1;    // [13]
        uint32_t copyAlpha   :1;    // [14]     alpha passes through unchanged
        uint32_t _unused2    :1;    // [15]
        uint32_t coeffMask   :16;   // [16-31]  bit i set when matrix entry i is non-zero
        uint32_t addMask     :4;    // [32-35]  bit c set when add term c is needed
    } u;
};
126
127 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
128 public:
129 virtual void populateScript(Script *);
130
131 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
132
133 virtual ~RsdCpuScriptIntrinsicColorMatrix();
134 RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
135
136 virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
137 const void * usr, uint32_t usrLen, const RsScriptCall *sc);
138 virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
139 const void * usr, uint32_t usrLen, const RsScriptCall *sc);
140
141 protected:
142 float fp[16];
143 float fpa[4];
144
145 // The following four fields are read as constants
146 // by the SIMD assembly code.
147 short ip[16];
148 int ipa[16];
149 float tmpFp[16];
150 float tmpFpa[16];
151
152 static void kernel(const RsForEachStubParamStruct *p,
153 uint32_t xstart, uint32_t xend,
154 uint32_t instep, uint32_t outstep);
155 void updateCoeffCache(float fpMul, float addMul);
156
157 Key_t mLastKey;
158 unsigned char *mBuf;
159 size_t mBufSize;
160
161 Key_t computeKey(const Element *ein, const Element *eout);
162
163 bool build(Key_t key);
164
165 void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
166
167 };
168
169 }
170 }
171
172
computeKey(const Element * ein,const Element * eout)173 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
174 const Element *ein, const Element *eout) {
175
176 Key_t key;
177 key.key = 0;
178
179 // Compute a unique code key for this operation
180
181 // Add to the key the input and output types
182 bool hasFloat = false;
183 if (ein->getType() == RS_TYPE_FLOAT_32) {
184 hasFloat = true;
185 key.u.inType = RS_TYPE_FLOAT_32;
186 rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
187 }
188 if (eout->getType() == RS_TYPE_FLOAT_32) {
189 hasFloat = true;
190 key.u.outType = RS_TYPE_FLOAT_32;
191 rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
192 }
193
194 // Mask in the bits indicating which coefficients in the
195 // color matrix are needed.
196 if (hasFloat) {
197 for (uint32_t i=0; i < 16; i++) {
198 if (fabs(fp[i]) != 0.f) {
199 key.u.coeffMask |= 1 << i;
200 }
201 }
202 if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
203 if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
204 if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
205 if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
206
207 } else {
208 for (uint32_t i=0; i < 16; i++) {
209 if (ip[i] != 0) {
210 key.u.coeffMask |= 1 << i;
211 }
212 }
213 if (ipa[0] != 0) key.u.addMask |= 0x1;
214 if (ipa[4] != 0) key.u.addMask |= 0x2;
215 if (ipa[8] != 0) key.u.addMask |= 0x4;
216 if (ipa[12] != 0) key.u.addMask |= 0x8;
217 }
218
219 // Look for a dot product where the r,g,b colums are the same
220 if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
221 (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
222 (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
223 (ip[12] == ip[13]) && (ip[12] == ip[14])) {
224
225 if (!key.u.addMask) key.u.dot = 1;
226 }
227
228 // Is alpha a simple copy
229 if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
230 key.u.copyAlpha = !(key.u.inType || key.u.outType);
231 }
232
233 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
234
235 switch (ein->getVectorSize()) {
236 case 4:
237 key.u.inVecSize = 3;
238 break;
239 case 3:
240 key.u.inVecSize = 2;
241 key.u.coeffMask &= ~0xF000;
242 break;
243 case 2:
244 key.u.inVecSize = 1;
245 key.u.coeffMask &= ~0xFF00;
246 break;
247 default:
248 key.u.coeffMask &= ~0xFFF0;
249 break;
250 }
251
252 switch (eout->getVectorSize()) {
253 case 4:
254 key.u.outVecSize = 3;
255 break;
256 case 3:
257 key.u.outVecSize = 2;
258 key.u.coeffMask &= ~0x8888;
259 break;
260 case 2:
261 key.u.outVecSize = 1;
262 key.u.coeffMask &= ~0xCCCC;
263 break;
264 default:
265 key.u.coeffMask &= ~0xEEEE;
266 break;
267 }
268
269 if (key.u.inType && !key.u.outType) {
270 key.u.addMask |= 1;
271 if (key.u.outVecSize > 0) key.u.addMask |= 2;
272 if (key.u.outVecSize > 1) key.u.addMask |= 4;
273 if (key.u.outVecSize > 2) key.u.addMask |= 8;
274 }
275
276 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
277 return key;
278 }
279
280 #if defined(ARCH_ARM_HAVE_NEON)
281
// Declares the three symbols that accompany one pre-assembled code chunk:
// its first word, its end marker, and a word holding its length in bytes
// (the length is what ADD_CHUNK passes to memcpy).
#define DEF_SYM(x)                                      \
    extern "C" uint32_t _N_ColorMatrix_##x;             \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;       \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologue (integer / float variants) and the two epilogue halves
// that surround the loop branch.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads: u8 for the integer kernel, u8f for u8 input to the float kernel,
// f32 for float input.  The suffix is the vector size.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores: u8 for the integer kernel, f32 for float output, f32u for u8
// output from the float kernel.  3-wide u8 output reuses the 4-wide store.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer-kernel helpers: widen before the multiply, narrow afterwards.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
329
// Appends the pre-assembled chunk `x` at the local cursor `buf` and advances
// the cursor by the chunk's length.  Wrapped in do { } while (0) so the two
// statements always act as one: without the wrapper, an unbraced
// `if (c) ADD_CHUNK(x);` would guard only the memcpy and advance `buf`
// unconditionally.
#define ADD_CHUNK(x)                                                     \
    do {                                                                 \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len);      \
        buf += _N_ColorMatrix_##x##_len;                                 \
    } while (0)
333
334
335 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
336 size_t off = (target - buf - 8) >> 2;
337 rsAssert(((off & 0xff000000) == 0) ||
338 ((off & 0xff000000) == 0xff000000));
339
340 uint32_t op = (condition << 28);
341 op |= 0xa << 24; // branch
342 op |= 0xffffff & off;
343 ((uint32_t *)buf)[0] = op;
344 return buf + 4;
345 }
346
encodeSIMDRegs(uint32_t vd,uint32_t vn,uint32_t vm)347 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
348 rsAssert(vd < 32);
349 rsAssert(vm < 32);
350 rsAssert(vn < 32);
351
352 uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
353 op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
354 op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
355 return op;
356 }
357
addVMLAL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)358 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
359 //vmlal.s16 Q#1, D#1, D#2[#]
360 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
361 ((uint32_t *)buf)[0] = op;
362 return buf + 4;
363 }
364
addVMULL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)365 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
366 //vmull.s16 Q#1, D#1, D#2[#]
367 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
368 ((uint32_t *)buf)[0] = op;
369 return buf + 4;
370 }
371
addVQADD_S32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)372 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
373 //vqadd.s32 Q#1, D#1, D#2
374 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
375 ((uint32_t *)buf)[0] = op;
376 return buf + 4;
377 }
378
addVMLAL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)379 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
380 //vmlal.f32 Q#1, D#1, D#2[#]
381 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
382 ((uint32_t *)buf)[0] = op;
383 return buf + 4;
384 }
385
addVMULL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)386 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
387 //vmull.f32 Q#1, D#1, D#2[#]
388 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
389 ((uint32_t *)buf)[0] = op;
390 return buf + 4;
391 }
392
addVORR_32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)393 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
394 //vadd.f32 Q#1, D#1, D#2
395 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
396 ((uint32_t *)buf)[0] = op;
397 return buf + 4;
398 }
399
addVADD_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)400 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
401 //vadd.f32 Q#1, D#1, D#2
402 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
403 ((uint32_t *)buf)[0] = op;
404 return buf + 4;
405 }
406 #endif
407
408
build(Key_t key)409 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
410 #if defined(ARCH_ARM_HAVE_NEON)
411 mBufSize = 4096;
412 //StopWatch build_time("rs cm: build time");
413 mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
414 MAP_PRIVATE | MAP_ANON, -1, 0);
415 if (!mBuf) {
416 return false;
417 }
418
419 uint8_t *buf = mBuf;
420 uint8_t *buf2 = NULL;
421
422 int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final
423 int opInit[4] = {0, 0, 0, 0};
424
425 memset(ops, 0, sizeof(ops));
426 for (int i=0; i < 4; i++) {
427 if (key.u.coeffMask & (1 << (i*4))) {
428 ops[i][0] = 0x2 | opInit[0];
429 opInit[0] = 1;
430 }
431 if (!key.u.dot) {
432 if (key.u.coeffMask & (1 << (1 + i*4))) {
433 ops[i][1] = 0x2 | opInit[1];
434 opInit[1] = 1;
435 }
436 if (key.u.coeffMask & (1 << (2 + i*4))) {
437 ops[i][2] = 0x2 | opInit[2];
438 opInit[2] = 1;
439 }
440 }
441 if (!key.u.copyAlpha) {
442 if (key.u.coeffMask & (1 << (3 + i*4))) {
443 ops[i][3] = 0x2 | opInit[3];
444 opInit[3] = 1;
445 }
446 }
447 }
448
449 if (key.u.inType || key.u.outType) {
450 key.u.copyAlpha = 0;
451 ADD_CHUNK(prefix_f);
452 buf2 = buf;
453
454 // Load the incoming r,g,b,a as needed
455 if (key.u.inType) {
456 switch(key.u.inVecSize) {
457 case 3:
458 ADD_CHUNK(load_f32_4);
459 break;
460 case 2:
461 ADD_CHUNK(load_f32_3);
462 break;
463 case 1:
464 ADD_CHUNK(load_f32_2);
465 break;
466 case 0:
467 ADD_CHUNK(load_f32_1);
468 break;
469 }
470 } else {
471 switch(key.u.inVecSize) {
472 case 3:
473 ADD_CHUNK(load_u8f_4);
474 break;
475 case 2:
476 ADD_CHUNK(load_u8f_3);
477 break;
478 case 1:
479 ADD_CHUNK(load_u8f_2);
480 break;
481 case 0:
482 ADD_CHUNK(load_u8f_1);
483 break;
484 }
485 }
486
487 for (int i=0; i < 4; i++) {
488 for (int j=0; j < 4; j++) {
489 switch(ops[i][j]) {
490 case 0:
491 break;
492 case 2:
493 buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
494 break;
495 case 3:
496 buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
497 break;
498 }
499 }
500 }
501 for (int j=0; j < 4; j++) {
502 if (opInit[j]) {
503 if (key.u.addMask & (1 << j)) {
504 buf = addVADD_F32(buf, j, 12+j, 8+j);
505 } else {
506 buf = addVORR_32(buf, j, 12+j, 12+j);
507 }
508 } else {
509 if (key.u.addMask & (1 << j)) {
510 buf = addVADD_F32(buf, j, j, 8+j);
511 }
512 }
513 }
514
515 if (key.u.outType) {
516 switch(key.u.outVecSize) {
517 case 3:
518 ADD_CHUNK(store_f32_4);
519 break;
520 case 2:
521 ADD_CHUNK(store_f32_3);
522 break;
523 case 1:
524 ADD_CHUNK(store_f32_2);
525 break;
526 case 0:
527 ADD_CHUNK(store_f32_1);
528 break;
529 }
530 } else {
531 switch(key.u.outVecSize) {
532 case 3:
533 case 2:
534 ADD_CHUNK(store_f32u_4);
535 break;
536 case 1:
537 ADD_CHUNK(store_f32u_2);
538 break;
539 case 0:
540 ADD_CHUNK(store_f32u_1);
541 break;
542 }
543 }
544
545
546 } else {
547 // Add the function prefix
548 // Store the address for the loop return
549 ADD_CHUNK(prefix_i);
550 buf2 = buf;
551
552 // Load the incoming r,g,b,a as needed
553 switch(key.u.inVecSize) {
554 case 3:
555 ADD_CHUNK(load_u8_4);
556 if (key.u.copyAlpha) {
557 ADD_CHUNK(unpack_u8_3);
558 } else {
559 ADD_CHUNK(unpack_u8_4);
560 }
561 break;
562 case 2:
563 ADD_CHUNK(load_u8_3);
564 ADD_CHUNK(unpack_u8_3);
565 break;
566 case 1:
567 ADD_CHUNK(load_u8_2);
568 ADD_CHUNK(unpack_u8_2);
569 break;
570 case 0:
571 ADD_CHUNK(load_u8_1);
572 ADD_CHUNK(unpack_u8_1);
573 break;
574 }
575
576 // Add multiply and accumulate
577 // use MULL to init the output register,
578 // use MLAL from there
579 for (int i=0; i < 4; i++) {
580 for (int j=0; j < 4; j++) {
581 switch(ops[i][j]) {
582 case 0:
583 break;
584 case 2:
585 buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
586 break;
587 case 3:
588 buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
589 break;
590 }
591 }
592 }
593 for (int j=0; j < 4; j++) {
594 if (opInit[j]) {
595 if (key.u.addMask & (1 << j)) {
596 buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
597 }
598 } else {
599 if (key.u.addMask & (1 << j)) {
600 buf = addVQADD_S32(buf, 8+j, 12+j, 4+j);
601 }
602 }
603 }
604
605 // If we have a dot product, perform the special pack.
606 if (key.u.dot) {
607 ADD_CHUNK(pack_u8_1);
608 ADD_CHUNK(dot);
609 } else {
610 switch(key.u.outVecSize) {
611 case 3:
612 if (key.u.copyAlpha) {
613 ADD_CHUNK(pack_u8_3);
614 } else {
615 ADD_CHUNK(pack_u8_4);
616 }
617 break;
618 case 2:
619 ADD_CHUNK(pack_u8_3);
620 break;
621 case 1:
622 ADD_CHUNK(pack_u8_2);
623 break;
624 case 0:
625 ADD_CHUNK(pack_u8_1);
626 break;
627 }
628 }
629
630 // Write out result
631 switch(key.u.outVecSize) {
632 case 3:
633 case 2:
634 ADD_CHUNK(store_u8_4);
635 break;
636 case 1:
637 ADD_CHUNK(store_u8_2);
638 break;
639 case 0:
640 ADD_CHUNK(store_u8_1);
641 break;
642 }
643 }
644
645 if (key.u.inType != key.u.outType) {
646 key.u.copyAlpha = 0;
647 key.u.dot = 0;
648 }
649
650 // Loop, branch, and cleanup
651 ADD_CHUNK(postfix1);
652 buf = addBranch(buf, buf2, 0x01);
653 ADD_CHUNK(postfix2);
654
655 int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
656 if (ret == -1) {
657 ALOGE("mprotect error %i", ret);
658 return false;
659 }
660
661 cacheflush((long)mBuf, (long)mBuf + mBufSize, 0);
662 return true;
663 #else
664 return false;
665 #endif
666 }
667
updateCoeffCache(float fpMul,float addMul)668 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
669 for(int ct=0; ct < 16; ct++) {
670 ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
671 tmpFp[ct] = fp[ct] * fpMul;
672 //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]);
673 }
674
675 float add = 0.f;
676 if (fpMul > 254.f) add = 0.5f;
677 for(int ct=0; ct < 4; ct++) {
678 tmpFpa[ct * 4 + 0] = fpa[ct] * addMul + add;
679 //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
680 tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4];
681 tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4];
682 tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4];
683 }
684
685 for(int ct=0; ct < 4; ct++) {
686 ipa[ct * 4 + 0] = (int)(fpa[ct] * 65536.f + 0.5f);
687 ipa[ct * 4 + 1] = ipa[ct * 4];
688 ipa[ct * 4 + 2] = ipa[ct * 4];
689 ipa[ct * 4 + 3] = ipa[ct * 4];
690 }
691 }
692
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)693 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
694 size_t dataLength) {
695 switch(slot) {
696 case 0:
697 memcpy (fp, data, sizeof(fp));
698 break;
699 case 1:
700 memcpy (fpa, data, sizeof(fpa));
701 break;
702 default:
703 rsAssert(0);
704 break;
705 }
706 mRootPtr = &kernel;
707 }
708
709
One(const RsForEachStubParamStruct * p,void * out,const void * py,const float * coeff,const float * add,uint32_t vsin,uint32_t vsout,bool fin,bool fout)710 static void One(const RsForEachStubParamStruct *p, void *out,
711 const void *py, const float* coeff, const float *add,
712 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
713
714 float4 f = 0.f;
715 if (fin) {
716 switch(vsin) {
717 case 3:
718 f = ((const float4 *)py)[0];
719 break;
720 case 2:
721 f = ((const float4 *)py)[0];
722 f.w = 0.f;
723 break;
724 case 1:
725 f.xy = ((const float2 *)py)[0];
726 break;
727 case 0:
728 f.x = ((const float *)py)[0];
729 break;
730 }
731 } else {
732 switch(vsin) {
733 case 3:
734 f = convert_float4(((const uchar4 *)py)[0]);
735 break;
736 case 2:
737 f = convert_float4(((const uchar4 *)py)[0]);
738 f.w = 0.f;
739 break;
740 case 1:
741 f.xy = convert_float2(((const uchar2 *)py)[0]);
742 break;
743 case 0:
744 f.x = (float)(((const uchar *)py)[0]);
745 break;
746 }
747 }
748 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w);
749
750 float4 sum;
751 sum.x = f.x * coeff[0] +
752 f.y * coeff[4] +
753 f.z * coeff[8] +
754 f.w * coeff[12];
755 sum.y = f.x * coeff[1] +
756 f.y * coeff[5] +
757 f.z * coeff[9] +
758 f.w * coeff[13];
759 sum.z = f.x * coeff[2] +
760 f.y * coeff[6] +
761 f.z * coeff[10] +
762 f.w * coeff[14];
763 sum.w = f.x * coeff[3] +
764 f.y * coeff[7] +
765 f.z * coeff[11] +
766 f.w * coeff[15];
767 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
768
769 sum.x += add[0];
770 sum.y += add[4];
771 sum.z += add[8];
772 sum.w += add[12];
773
774
775 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
776 if (fout) {
777 switch(vsout) {
778 case 3:
779 case 2:
780 ((float4 *)out)[0] = sum;
781 break;
782 case 1:
783 ((float2 *)out)[0] = sum.xy;
784 break;
785 case 0:
786 ((float *)out)[0] = sum.x;
787 break;
788 }
789 } else {
790 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
791 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
792 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
793 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
794
795 switch(vsout) {
796 case 3:
797 case 2:
798 ((uchar4 *)out)[0] = convert_uchar4(sum);
799 break;
800 case 1:
801 ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
802 break;
803 case 0:
804 ((uchar *)out)[0] = sum.x;
805 break;
806 }
807 }
808 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
809 }
810
kernel(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)811 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
812 uint32_t xstart, uint32_t xend,
813 uint32_t instep, uint32_t outstep) {
814 RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
815 uchar *out = (uchar *)p->out;
816 uchar *in = (uchar *)p->in;
817 uint32_t x1 = xstart;
818 uint32_t x2 = xend;
819
820 uint32_t vsin = cp->mLastKey.u.inVecSize;
821 uint32_t vsout = cp->mLastKey.u.outVecSize;
822 bool floatIn = !!cp->mLastKey.u.inType;
823 bool floatOut = !!cp->mLastKey.u.outType;
824
825 //if (!p->y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
826
827 if(x2 > x1) {
828 int32_t len = (x2 - x1) >> 2;
829 if((cp->mOptKernel != NULL) && (len > 0)) {
830 cp->mOptKernel(out, in, cp->ip, len);
831 x1 += len << 2;
832 out += outstep * (len << 2);
833 in += instep * (len << 2);
834 }
835
836 while(x1 != x2) {
837 One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
838 out += outstep;
839 in += instep;
840 x1++;
841 }
842 }
843 }
844
preLaunch(uint32_t slot,const Allocation * ain,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)845 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
846 uint32_t slot, const Allocation * ain, Allocation * aout,
847 const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
848
849 const Element *ein = ain->mHal.state.type->getElement();
850 const Element *eout = aout->mHal.state.type->getElement();
851
852 if (ein->getType() == eout->getType()) {
853 if (eout->getType() == RS_TYPE_UNSIGNED_8) {
854 updateCoeffCache(1.f, 255.f);
855 } else {
856 updateCoeffCache(1.f, 1.f);
857 }
858 } else {
859 if (eout->getType() == RS_TYPE_UNSIGNED_8) {
860 updateCoeffCache(255.f, 255.f);
861 } else {
862 updateCoeffCache(1.f / 255.f, 1.f);
863 }
864 }
865
866 Key_t key = computeKey(ain->mHal.state.type->getElement(),
867 aout->mHal.state.type->getElement());
868 if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
869 if (mBuf) munmap(mBuf, mBufSize);
870 mBuf = NULL;
871 mOptKernel = NULL;
872 if (build(key)) {
873 mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
874 mLastKey = key;
875 }
876 }
877 }
878
postLaunch(uint32_t slot,const Allocation * ain,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)879 void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
880 uint32_t slot, const Allocation * ain, Allocation * aout,
881 const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
882
883 }
884
RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)885 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
886 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
887 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
888
889 mLastKey.key = 0;
890 mBuf = NULL;
891 mBufSize = 0;
892 mOptKernel = NULL;
893 const static float defaultMatrix[] = {
894 1.f, 0.f, 0.f, 0.f,
895 0.f, 1.f, 0.f, 0.f,
896 0.f, 0.f, 1.f, 0.f,
897 0.f, 0.f, 0.f, 1.f
898 };
899 const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
900 setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
901 setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
902 }
903
~RsdCpuScriptIntrinsicColorMatrix()904 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
905 if (mBuf) munmap(mBuf, mBufSize);
906 mBuf = NULL;
907 mOptKernel = NULL;
908 }
909
populateScript(Script * s)910 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
911 s->mHal.info.exportedVariableCount = 2;
912 }
913
rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)914 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
915 const Script *s, const Element *e) {
916
917 return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
918 }
919
920
921
922