1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "RenderScriptToolkit.h"
18 #include "TaskProcessor.h"
19 #include "Utils.h"
20 #include <cassert>
21 #include <cstdint>
22 #include <sys/mman.h>
23
24 namespace renderscript {
25
26 #define LOG_TAG "renderscript.toolkit.ColorMatrix"
27
28 /* uint kernel
29 * Q0 D0: Load slot for R
30 * D1: Load slot for G
31 * Q1 D2: Load slot for B
32 * D3: Load slot for A
33 * Q2 D4: Matrix
34 * D5: =
35 * Q3 D6: =
36 * D7: =
37 * Q4 D8: Add R
38 * D9:
39 * Q5 D10: Add G
40 * D11:
41 * Q6 D12: Add B
42 * D13:
43 * Q7 D14: Add A
44 * D15:
45 * Q8 D16: I32: R Sum
46 * D17:
47 * Q9 D18: I32: G Sum
48 * D19:
49 * Q10 D20: I32: B Sum
50 * D21:
51 * Q11 D22: I32: A Sum
52 * D23:
53 * Q12 D24: U16: expanded R
54 * D25:
55 * Q13 D26: U16: expanded G
56 * D27:
57 * Q14 D28: U16: expanded B
58 * D29:
59 * Q15 D30: U16: expanded A
60 * D31:
61 *
62 */
63
64 /* float kernel
65 * Q0 D0: Load slot for R
66 * D1: =
67 * Q1 D2: Load slot for G
68 * D3: =
69 * Q2 D4: Load slot for B
70 * D5: =
71 * Q3 D6: Load slot for A
72 * D7: =
73 * Q4 D8: Matrix
74 * D9: =
75 * Q5 D10: =
76 * D11: =
77 * Q6 D12: =
78 * D13: =
79 * Q7 D14: =
80 * D15: =
81 * Q8 D16: Add R
82 * D17: =
83 * Q9 D18: Add G
84 * D19: =
85 * Q10 D20: Add B
86 * D21: =
87 * Q11 D22: Add A
88 * D23: =
89 * Q12 D24: Sum R
90 * D25: =
91 * Q13 D26: Sum G
92 * D27: =
93 * Q14 D28: Sum B
94 * D29: =
95 * Q15 D30: Sum A
96 * D31: =
97 *
98 */
99
/**
 * Compact description of one color-matrix configuration.
 *
 * computeKey() fills in the bit-fields of `u`; the 64-bit `key` member is the
 * view used to reset the whole key to zero and to compare two keys.
 * The bit positions in the trailing comments are the ones noted by the
 * original author; the real layout is decided by the compiler's bit-field rules.
 */
union Key_t {
    uint64_t key;
    struct {
        uint32_t inVecSize  : 2;   // [0 - 1]  input vector size code
        uint32_t outVecSize : 2;   // [2 - 3]  output vector size code
        uint32_t inType     : 4;   // [4 - 7]  non-zero when the input is float
        uint32_t outType    : 4;   // [8 - 11] non-zero when the output is float
        uint32_t dot        : 1;   // [12]     r, g, b columns are identical
        uint32_t _unused1   : 1;   // [13]
        uint32_t copyAlpha  : 1;   // [14]     alpha is passed through unchanged
        uint32_t _unused2   : 1;   // [15]
        uint32_t coeffMask  : 16;  // [16-31]  one bit per non-zero matrix entry
        uint32_t addMask    : 4;   // [32-35]  one bit per non-zero add entry
    } u;
};
115
// Element type codes, using the numeric values given in the RenderScript
// documentation. Only RS_TYPE_UNSIGNED_8 is currently supported.
//
// TODO: The actual values of these constants are likely not important. We may
// be able to simplify the key related code.
constexpr int RS_TYPE_FLOAT_32 = 2;
constexpr int RS_TYPE_UNSIGNED_8 = 8;
124
125 //Re-enable when intrinsic is fixed
126 #if defined(ARCH_ARM64_USE_INTRINSICS)
// Table of code entry points exchanged with the ARM64 assembly routines
// declared below: the setup routines fill it in and the kernels consume it.
struct FunctionTab_t {
    void (*column[4])();  // one entry per matrix column
    void (*store)();
    void (*load)();
    void (*store_end)();
    void (*load_end)();
};
134
135 extern "C" void rsdIntrinsicColorMatrix_int_K(
136 void *out, void const *in, size_t count,
137 FunctionTab_t const *fns,
138 int16_t const *mult, int32_t const *add);
139
140 extern "C" void rsdIntrinsicColorMatrix_float_K(
141 void *out, void const *in, size_t count,
142 FunctionTab_t const *fns,
143 float const *mult, float const *add);
144
145 /* The setup functions fill in function tables to be used by above functions;
146 * this code also eliminates jump-to-another-jump cases by short-circuiting
147 * empty functions. While it's not performance critical, it works out easier
148 * to write the set-up code in assembly than to try to expose the same symbols
149 * and write the code in C.
150 */
151 extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
152 FunctionTab_t *fns,
153 uint32_t mask, int dt, int st);
154
155 extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
156 FunctionTab_t *fns,
157 uint32_t mask, int dt, int st);
158 #endif // ARCH_ARM64_USE_INTRINSICS
159
160 class ColorMatrixTask : public Task {
161 const void* mIn;
162 void* mOut;
163 size_t mInputVectorSize;
164 uint32_t mOutstep;
165 uint32_t mInstep;
166
167 float mFp[16];
168 float mFpa[4];
169
170 // The following four fields are read as constants
171 // by the SIMD assembly code.
172 int16_t mIp[16];
173 int mIpa[4];
174 float mTmpFp[16];
175 float mTmpFpa[4];
176 #if defined(ARCH_ARM64_USE_INTRINSICS)
177 FunctionTab_t mFnTab;
178 #endif
179
180 void kernel(uchar* out, uchar* in, uint32_t xstart, uint32_t xend);
181 void updateCoeffCache(float fpMul, float addMul);
182
183 Key_t mLastKey;
184 unsigned char* mBuf;
185 size_t mBufSize;
186
187 bool build(Key_t key);
188 void (*mOptKernel)(void* dst, const void* src, const int16_t* coef, uint32_t count);
189
190 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
191 Key_t computeKey(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
192 void preLaunch(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
193 #else
194 Key_t computeKey(size_t inVectorSize, size_t outVectorSize);
195 void preLaunch(size_t inVectorSize, size_t outVectorSize);
196 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
197
198 // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
199 void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
200 size_t endY) override;
201
202 public:
ColorMatrixTask(const void * in,void * out,size_t inputVectorSize,size_t outputVectorSize,size_t sizeX,size_t sizeY,const float * matrix,const float * addVector,const Restriction * restriction)203 ColorMatrixTask(const void* in, void* out, size_t inputVectorSize, size_t outputVectorSize,
204 size_t sizeX, size_t sizeY, const float* matrix, const float* addVector,
205 const Restriction* restriction)
206 : Task{sizeX, sizeY, outputVectorSize, true, restriction},
207 mIn{in},
208 mOut{out},
209 mInputVectorSize{inputVectorSize} {
210 mLastKey.key = 0;
211 mBuf = nullptr;
212 mBufSize = 0;
213 mOptKernel = nullptr;
214
215 mOutstep = paddedSize(outputVectorSize);
216 mInstep = paddedSize(inputVectorSize);
217
218 memcpy(mFp, matrix, sizeof(mFp));
219 memcpy(mFpa, addVector, sizeof(mFpa));
220 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
221 // For float support, we'll have to pass the type in the constructor too.
222 preLaunch(inputVectorSize, RS_TYPE_UNSIGNED_8, outputVectorSize, RS_TYPE_UNSIGNED_8);
223 #else
224 preLaunch(inputVectorSize, outputVectorSize);
225 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
226 }
~ColorMatrixTask()227 ~ColorMatrixTask() {
228 if (mBuf) munmap(mBuf, mBufSize);
229 mBuf = nullptr;
230 mOptKernel = nullptr;
231 }
232 };
233
#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Builds the Key_t that describes the current operation: vector sizes, element
 * types, which matrix/add coefficients are non-zero, and whether the "dot" and
 * "copy alpha" shortcuts apply.
 *
 * Reads mFp/mFpa (float path) and mIp/mIpa (integer path), so
 * updateCoeffCache() must have run first.
 *
 * NOTE: the function header and the opening of the coefficient-scan block are
 * selected by the preprocessor; both variants share everything after #endif.
 */
Key_t ColorMatrixTask::computeKey(size_t inVectorSize, int inType, size_t outVectorSize,
                                  int outType) {
    Key_t key;
    key.key = 0;

    // Compute a unique code key for this operation

    // Add to the key the input and output types
    bool hasFloat = false;
    if (inType == RS_TYPE_FLOAT_32) {
        hasFloat = true;
        key.u.inType = RS_TYPE_FLOAT_32;
    }
    if (outType == RS_TYPE_FLOAT_32) {
        hasFloat = true;
        key.u.outType = RS_TYPE_FLOAT_32;
    }

    // Mask in the bits indicating which coefficients in the
    // color matrix are needed.
    if (hasFloat) {
        // Float path: test the float coefficients directly.
        for (uint32_t i=0; i < 16; i++) {
            if (fabs(mFp[i]) != 0.f) {
                key.u.coeffMask |= 1 << i;
            }
        }
        if (fabs(mFpa[0]) != 0.f) key.u.addMask |= 0x1;
        if (fabs(mFpa[1]) != 0.f) key.u.addMask |= 0x2;
        if (fabs(mFpa[2]) != 0.f) key.u.addMask |= 0x4;
        if (fabs(mFpa[3]) != 0.f) key.u.addMask |= 0x8;

    } else {
#else
/**
 * Builds the Key_t that describes the current operation: vector sizes, which
 * matrix/add coefficients are non-zero, and whether the "dot" and "copy alpha"
 * shortcuts apply.
 *
 * Reads mIp/mIpa, so updateCoeffCache() must have run first.
 */
Key_t ColorMatrixTask::computeKey(size_t inVectorSize, size_t outVectorSize) {
    Key_t key;
    key.key = 0;

    // Compute a unique code key for this operation
    {
#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT

        // Integer path: test the fixed-point coefficients, so a value that
        // converted to 0 in updateCoeffCache() counts as unused.
        for (uint32_t i=0; i < 16; i++) {
            if (mIp[i] != 0) {
                key.u.coeffMask |= 1 << i;
            }
        }
        if (mIpa[0] != 0) key.u.addMask |= 0x1;
        if (mIpa[1] != 0) key.u.addMask |= 0x2;
        if (mIpa[2] != 0) key.u.addMask |= 0x4;
        if (mIpa[3] != 0) key.u.addMask |= 0x8;
    }

    // Look for a dot product where the r,g,b columns are the same
    if ((mIp[0] == mIp[1]) && (mIp[0] == mIp[2]) &&
        (mIp[4] == mIp[5]) && (mIp[4] == mIp[6]) &&
        (mIp[8] == mIp[9]) && (mIp[8] == mIp[10]) &&
        (mIp[12] == mIp[13]) && (mIp[12] == mIp[14])) {

        // Only treated as a dot product when no add term is present.
        if (!key.u.addMask) key.u.dot = 1;
    }

    // Is alpha a simple copy
    // (0x0888 selects entries 3, 7 and 11; 256 is 1.0 in the 8.8 fixed-point
    // form produced by updateCoeffCache().)
    if (!(key.u.coeffMask & 0x0888) && (mIp[15] == 256) && !(key.u.addMask & 0x8)) {
        key.u.copyAlpha = !(key.u.inType || key.u.outType);
    }

    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);

    // Encode the input vector size as (size - 1) and drop the coefficient
    // bits of the input components that do not exist.
    switch (inVectorSize) {
    case 4:
        key.u.inVecSize = 3;
        break;
    case 3:
        key.u.inVecSize = 2;
        key.u.coeffMask &= ~0xF000;
        break;
    case 2:
        key.u.inVecSize = 1;
        key.u.coeffMask &= ~0xFF00;
        break;
    default:
        // inVecSize stays 0 from the initial key.key = 0.
        key.u.coeffMask &= ~0xFFF0;
        break;
    }

    // Same for the output: drop coefficient and add bits that feed output
    // components that do not exist.
    switch (outVectorSize) {
    case 4:
        key.u.outVecSize = 3;
        break;
    case 3:
        key.u.outVecSize = 2;
        key.u.coeffMask &= ~0x8888;
        key.u.addMask &= 7;
        break;
    case 2:
        key.u.outVecSize = 1;
        key.u.coeffMask &= ~0xCCCC;
        key.u.addMask &= 3;
        break;
    default:
        key.u.coeffMask &= ~0xEEEE;
        key.u.addMask &= 1;
        break;
    }

    // Float input with non-float output: force the add bit on for every
    // output component that exists.
    // NOTE(review): presumably because updateCoeffCache() folds a 0.5 rounding
    // term into the add vector for this combination -- confirm.
    if (key.u.inType && !key.u.outType) {
        key.u.addMask |= 1;
        if (key.u.outVecSize > 0) key.u.addMask |= 2;
        if (key.u.outVecSize > 1) key.u.addMask |= 4;
        if (key.u.outVecSize > 2) key.u.addMask |= 8;
    }

    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
    return key;
}
350
351 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
352
// Declares the three external symbols that describe one code fragment named
// `x`: its start, its end, and its length. ADD_CHUNK() copies
// `_N_ColorMatrix_<x>_len` bytes starting at `&_N_ColorMatrix_<x>`.
// NOTE(review): the symbols are presumably defined in a separate assembly
// file -- confirm against the build.
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologue (integer / float variants) and the two epilogue pieces
// that build() places around the loop branch.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads of uchar input; the "u8f" variants are used on the float path.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)

// Loads of float input.
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores of uchar output. There is no 3-wide variant: build() uses the
// 4-wide store for 3-component output.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)

// Stores of float output; the "f32u" variants are used on the float path when
// the output type is not float.
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer path helpers: widening after load, narrowing before store, the
// dot-product special case, and per-channel add fragments.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
402
// Copies the code fragment named `x` (see DEF_SYM) to the write cursor `buf`,
// which must be in scope at the point of use, and advances the cursor past it.
// The body is wrapped in do { } while (0) so that the macro expands to exactly
// one statement and stays correct under an unbraced if/else.
#define ADD_CHUNK(x)                                                    \
    do {                                                                \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len);     \
        buf += _N_ColorMatrix_##x##_len;                                \
    } while (0)
406
407
// Writes one 32-bit branch instruction at `buf` that jumps to `target`, using
// `condition` as the 4-bit condition field, and returns the address just past
// the emitted word. The displacement is counted in words relative to buf + 8
// and must fit in 24 signed bits.
static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
    const size_t wordOffset = (target - buf - 8) >> 2;
    // The bits above the 24-bit field must be a pure sign extension.
    assert(((wordOffset & 0xff000000) == 0) ||
           ((wordOffset & 0xff000000) == 0xff000000));

    uint32_t instruction = static_cast<uint32_t>(wordOffset & 0xffffff);
    instruction |= 0xa << 24;  // branch
    instruction |= condition << 28;

    *reinterpret_cast<uint32_t *>(buf) = instruction;
    return buf + sizeof(uint32_t);
}
419
// Packs three register numbers (each 0..31) into their instruction fields:
//   vd: low four bits at bit 12, fifth bit at bit 22
//   vn: low four bits at bit 16, fifth bit at bit 7
//   vm: low four bits at bit 0,  fifth bit at bit 5
// The result is meant to be OR-ed with an opcode template.
static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
    assert(vd < 32);
    assert(vm < 32);
    assert(vn < 32);

    const uint32_t destBits = ((vd & 0xf) << 12) | (((vd >> 4) & 1) << 22);
    const uint32_t firstSrcBits = ((vn & 0xf) << 16) | (((vn >> 4) & 1) << 7);
    const uint32_t secondSrcBits = (vm & 0xf) | (((vm >> 4) & 1) << 5);
    return destBits | secondSrcBits | firstSrcBits;
}
430
431 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
432 uint32_t src_d2_s) {
433 //vmlal.s16 Q#1, D#1, D#2[#]
434 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
435 ((uint32_t *)buf)[0] = op;
436 return buf + 4;
437 }
438
439 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
440 uint32_t src_d2_s) {
441 //vmull.s16 Q#1, D#1, D#2[#]
442 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
443 ((uint32_t *)buf)[0] = op;
444 return buf + 4;
445 }
446
447 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
448 //vqadd.s32 Q#1, Q#1, Q#2
449 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
450 ((uint32_t *)buf)[0] = op;
451 return buf + 4;
452 }
453
454 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
455 uint32_t src_d2_s) {
456 //vmlal.f32 Q#1, D#1, D#2[#]
457 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
458 ((uint32_t *)buf)[0] = op;
459 return buf + 4;
460 }
461
462 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
463 uint32_t src_d2_s) {
464 //vmull.f32 Q#1, D#1, D#2[#]
465 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
466 ((uint32_t *)buf)[0] = op;
467 return buf + 4;
468 }
469
470 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
471 //vadd.f32 Q#1, D#1, D#2
472 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
473 ((uint32_t *)buf)[0] = op;
474 return buf + 4;
475 }
476
477 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
478 //vmov.32 Q#1, #imm
479 assert(imm == 0);
480 (void) imm; // Avoid unused parameter warnings for non-debug builds
481 uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
482 ((uint32_t *)buf)[0] = op;
483 return buf + 4;
484 }
485
486 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
487 //vadd.f32 Q#1, D#1, D#2
488 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
489 ((uint32_t *)buf)[0] = op;
490 return buf + 4;
491 }
492 #endif
493
494 #if defined(ARCH_X86_HAVE_SSSE3)
// Optimized kernels for the three integer configurations that selectKernel()
// distinguishes (dot product, 3x3 with alpha copied, full 4x4). They are only
// declared here; all three share the signature of
// ColorMatrixTask::mOptKernel.
// NOTE(review): `count` is presumably a number of 4-pixel groups, matching
// how kernel() calls mOptKernel -- confirm against the implementation.
extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                         const int16_t *coef, uint32_t count);
extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                         const int16_t *coef, uint32_t count);
extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                         const int16_t *coef, uint32_t count);
501
502 void * selectKernel(Key_t key)
503 {
504 void * kernel = nullptr;
505
506 // inType, outType float if nonzero
507 if (!(key.u.inType || key.u.outType)) {
508 if (key.u.dot)
509 kernel = (void *)rsdIntrinsicColorMatrixDot_K;
510 else if (key.u.copyAlpha)
511 kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
512 else
513 kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
514 }
515
516 return kernel;
517 }
518 #endif
519
/**
 * Generates, on 32-bit ARM builds only, a kernel specialized for `key` into a
 * freshly mmap'ed buffer (mBuf / mBufSize).
 *
 * The kernel is assembled from pre-built fragments (ADD_CHUNK) plus
 * individually encoded multiply/accumulate/add instructions, then the buffer
 * is switched to read + execute.
 *
 * @param key Configuration to specialize for; taken by value and modified
 *            locally.
 * @return true when executable code is ready in mBuf; false when the mapping
 *         or mprotect fails, and always false on other architectures.
 *         After an mprotect failure mBuf stays mapped (the caller's
 *         destructor / next preLaunch() unmaps it).
 */
bool ColorMatrixTask::build(Key_t key) {
#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
    mBufSize = 4096;
    //StopWatch build_time("rs cm: build time");
    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANON, -1, 0);
    if (mBuf == MAP_FAILED) {
        mBuf = NULL;
        return false;
    }

    // buf is the write cursor; buf2 will remember where the loop body starts.
    uint8_t *buf = mBuf;
    uint8_t *buf2 = nullptr;

    // ops[i][j] describes what to emit for input component i feeding output
    // component j. Original legend: 0=unused, 1 = set, 2 = accumulate, 3 = final.
    // The values actually stored below are 0 (coefficient unused), 2 (first
    // term for output j: plain multiply) and 3 (later term: multiply-accumulate).
    int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
    // opInit[j] becomes 1 once output j has received its first term.
    int opInit[4] = {0, 0, 0, 0};

    memset(ops, 0, sizeof(ops));
    for (int i=0; i < 4; i++) {
        if (key.u.coeffMask & (1 << (i*4))) {
            ops[i][0] = 0x2 | opInit[0];
            opInit[0] = 1;
        }
        // For a dot product only output 0 is computed from the matrix.
        if (!key.u.dot) {
            if (key.u.coeffMask & (1 << (1 + i*4))) {
                ops[i][1] = 0x2 | opInit[1];
                opInit[1] = 1;
            }
            if (key.u.coeffMask & (1 << (2 + i*4))) {
                ops[i][2] = 0x2 | opInit[2];
                opInit[2] = 1;
            }
        }
        // When alpha is a straight copy no arithmetic is emitted for output 3.
        if (!key.u.copyAlpha) {
            if (key.u.coeffMask & (1 << (3 + i*4))) {
                ops[i][3] = 0x2 | opInit[3];
                opInit[3] = 1;
            }
        }
    }

    if (key.u.inType || key.u.outType) {
        // ---- Float path: at least one side is float. ----
        key.u.copyAlpha = 0;
        ADD_CHUNK(prefix_f);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        if (key.u.inType) {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_f32_4);
                break;
            case 2:
                ADD_CHUNK(load_f32_3);
                break;
            case 1:
                ADD_CHUNK(load_f32_2);
                break;
            case 0:
                ADD_CHUNK(load_f32_1);
                break;
            }
        } else {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_u8f_4);
                break;
            case 2:
                ADD_CHUNK(load_u8f_3);
                break;
            case 1:
                ADD_CHUNK(load_u8f_2);
                break;
            case 0:
                ADD_CHUNK(load_u8f_1);
                break;
            }
        }

        // Multiply / multiply-accumulate into the sum registers 12+j (see the
        // "float kernel" register map at the top of the file).
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                case 3:
                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                }
            }
        }
        // Produce the final value of each output j in register j: sum + add,
        // sum alone, add alone, or zero, depending on which terms exist.
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVADD_F32(buf, j, 12+j, 8+j);
                } else {
                    buf = addVORR_32(buf, j, 12+j, 12+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, j, 8+j, 8+j);
                } else {
                    buf = addVMOV_32(buf, j, 0);
                }
            }
        }

        if (key.u.outType) {
            switch(key.u.outVecSize) {
            case 3:
                ADD_CHUNK(store_f32_4);
                break;
            case 2:
                ADD_CHUNK(store_f32_3);
                break;
            case 1:
                ADD_CHUNK(store_f32_2);
                break;
            case 0:
                ADD_CHUNK(store_f32_1);
                break;
            }
        } else {
            // 3-component output uses the 4-wide store.
            switch(key.u.outVecSize) {
            case 3:
            case 2:
                ADD_CHUNK(store_f32u_4);
                break;
            case 1:
                ADD_CHUNK(store_f32u_2);
                break;
            case 0:
                ADD_CHUNK(store_f32u_1);
                break;
            }
        }


    } else {
        // ---- Integer path: uchar in, uchar out. ----
        // Add the function prefix
        // Store the address for the loop return
        ADD_CHUNK(prefix_i);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        switch(key.u.inVecSize) {
        case 3:
            ADD_CHUNK(load_u8_4);
            if (key.u.copyAlpha) {
                ADD_CHUNK(unpack_u8_3);
            } else {
                ADD_CHUNK(unpack_u8_4);
            }
            break;
        case 2:
            ADD_CHUNK(load_u8_3);
            ADD_CHUNK(unpack_u8_3);
            break;
        case 1:
            ADD_CHUNK(load_u8_2);
            ADD_CHUNK(unpack_u8_2);
            break;
        case 0:
            ADD_CHUNK(load_u8_1);
            ADD_CHUNK(unpack_u8_1);
            break;
        }

        // Add multiply and accumulate
        // use MULL to init the output register,
        // use MLAL from there
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                case 3:
                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                }
            }
        }
        // Apply the add term: saturating add onto an existing sum, or a plain
        // move of the add register when no product was emitted for output j.
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
                }
            }
        }

        // If we have a dot product, perform the special pack.
        if (key.u.dot) {
            ADD_CHUNK(pack_u8_1);
            ADD_CHUNK(dot);
        } else {
            switch(key.u.outVecSize) {
            case 3:
                if (key.u.copyAlpha) {
                    ADD_CHUNK(pack_u8_3);
                } else {
                    ADD_CHUNK(pack_u8_4);
                }
                break;
            case 2:
                ADD_CHUNK(pack_u8_3);
                break;
            case 1:
                ADD_CHUNK(pack_u8_2);
                break;
            case 0:
                ADD_CHUNK(pack_u8_1);
                break;
            }
        }

        // Write out result
        // (3-component output uses the 4-wide store.)
        switch(key.u.outVecSize) {
        case 3:
        case 2:
            ADD_CHUNK(store_u8_4);
            break;
        case 1:
            ADD_CHUNK(store_u8_2);
            break;
        case 0:
            ADD_CHUNK(store_u8_1);
            break;
        }
    }

    // NOTE(review): `key` is a by-value copy and is not read again below, so
    // these two assignments have no effect on the generated code or the caller.
    if (key.u.inType != key.u.outType) {
        key.u.copyAlpha = 0;
        key.u.dot = 0;
    }

    // Loop, branch, and cleanup
    // The branch jumps back to buf2 (start of the loop body) with condition
    // code 0x01.
    ADD_CHUNK(postfix1);
    buf = addBranch(buf, buf2, 0x01);
    ADD_CHUNK(postfix2);

    // Drop write permission and make the buffer executable.
    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
    if (ret == -1) {
        ALOGE("mprotect error %i", ret);
        return false;
    }

    // Flush the instruction cache for the range that was just written.
    __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
    return true;
#else
    (void) key; // Avoid unused parameter warning.
    return false;
#endif
}
782
783 void ColorMatrixTask::updateCoeffCache(float fpMul, float addMul) {
784 for(int ct=0; ct < 16; ct++) {
785 mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
786 mTmpFp[ct] = mFp[ct] * fpMul;
787 //ALOGE("mat %i %f %f", ct, mFp[ct], tmpFp[ct]);
788 }
789
790 float add = 0.f;
791 if (fpMul > 254.f) add = 0.5f;
792 for(int ct=0; ct < 4; ct++) {
793 mTmpFpa[ct] = mFpa[ct] * addMul + add;
794 //ALOGE("mFpa %i %f %f", ct, mFpa[ct], tmpFpa[ct * 4 + 0]);
795 }
796
797 for(int ct=0; ct < 4; ct++) {
798 mIpa[ct] = (int)(mFpa[ct] * 65536.f + 0.5f);
799 }
800 }
801
802
803
804 static void One(void *out,
805 const void *py, const float* coeff, const float *add,
806 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
807
808 float4 f = 0.f;
809 if (fin) {
810 switch(vsin) {
811 case 3:
812 f = ((const float4 *)py)[0];
813 break;
814 case 2:
815 f = ((const float4 *)py)[0];
816 f.w = 0.f;
817 break;
818 case 1:
819 f.xy = ((const float2 *)py)[0];
820 break;
821 case 0:
822 f.x = ((const float *)py)[0];
823 break;
824 }
825 } else {
826 switch(vsin) {
827 case 3:
828 f = convert<float4>(((const uchar4 *)py)[0]);
829 break;
830 case 2:
831 f = convert<float4>(((const uchar4 *)py)[0]);
832 f.w = 0.f;
833 break;
834 case 1:
835 f.xy = convert<float2>(((const uchar2 *)py)[0]);
836 break;
837 case 0:
838 f.x = (float)(((const uchar *)py)[0]);
839 break;
840 }
841 }
842 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w);
843
844 float4 sum;
845 sum.x = f.x * coeff[0] +
846 f.y * coeff[4] +
847 f.z * coeff[8] +
848 f.w * coeff[12];
849 sum.y = f.x * coeff[1] +
850 f.y * coeff[5] +
851 f.z * coeff[9] +
852 f.w * coeff[13];
853 sum.z = f.x * coeff[2] +
854 f.y * coeff[6] +
855 f.z * coeff[10] +
856 f.w * coeff[14];
857 sum.w = f.x * coeff[3] +
858 f.y * coeff[7] +
859 f.z * coeff[11] +
860 f.w * coeff[15];
861 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
862
863 sum.x += add[0];
864 sum.y += add[1];
865 sum.z += add[2];
866 sum.w += add[3];
867
868
869 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
870 if (fout) {
871 switch(vsout) {
872 case 3:
873 case 2:
874 ((float4 *)out)[0] = sum;
875 break;
876 case 1:
877 ((float2 *)out)[0] = sum.xy;
878 break;
879 case 0:
880 ((float *)out)[0] = sum.x;
881 break;
882 }
883 } else {
884 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
885 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
886 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
887 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
888
889 switch(vsout) {
890 case 3:
891 case 2:
892 ((uchar4 *)out)[0] = convert<uchar4>(sum);
893 break;
894 case 1:
895 ((uchar2 *)out)[0] = convert<uchar2>(sum.xy);
896 break;
897 case 0:
898 ((uchar *)out)[0] = sum.x;
899 break;
900 }
901 }
902 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2],
903 // ((float *)out)[3]);
904 }
905
906 void ColorMatrixTask::kernel(uchar *out, uchar *in, uint32_t xstart, uint32_t xend) {
907 uint32_t x1 = xstart;
908 uint32_t x2 = xend;
909
910 uint32_t vsin = mLastKey.u.inVecSize;
911 uint32_t vsout = mLastKey.u.outVecSize;
912 bool floatIn = !!mLastKey.u.inType;
913 bool floatOut = !!mLastKey.u.outType;
914
915 //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
916
917 if(x2 > x1) {
918 int32_t len = x2 - x1;
919 if (mUsesSimd) {
920 if((mOptKernel != nullptr) && (len >= 4)) {
921 // The optimized kernel processes 4 pixels at once
922 // and requires a minimum of 1 chunk of 4
923 mOptKernel(out, in, mIp, len >> 2);
924 // Update the len and pointers so the generic code can
925 // finish any leftover pixels
926 len &= ~3;
927 x1 += len;
928 out += mOutstep * len;
929 in += mInstep * len;
930 }
931 #if defined(ARCH_ARM64_USE_INTRINSICS)
932 else {
933 if (mLastKey.u.inType == RS_TYPE_FLOAT_32 ||
934 mLastKey.u.outType == RS_TYPE_FLOAT_32) {
935 // Currently this generates off by one errors.
936 // rsdIntrinsicColorMatrix_float_K(out, in, len, &mFnTab, tmpFp, tmpFpa);
937 // x1 += len;
938 // out += outstep * len;
939 // in += instep * len;
940 } else {
941 rsdIntrinsicColorMatrix_int_K(out, in, len, &mFnTab, mIp, mIpa);
942 x1 += len;
943 out += mOutstep * len;
944 in += mInstep * len;
945 }
946 }
947 #endif
948 }
949
950 while(x1 != x2) {
951 One(out, in, mTmpFp, mTmpFpa, vsin, vsout, floatIn, floatOut);
952 out += mOutstep;
953 in += mInstep;
954 x1++;
955 }
956 }
957 }
958
#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Prepares the task for execution: refreshes the coefficient caches, computes
 * the configuration key, and (re)builds the optimized kernel state when the
 * key changed or no optimized kernel is set.
 *
 * NOTE: the function header and the first statements are selected by the
 * preprocessor; both variants share everything after the first #endif.
 */
void ColorMatrixTask::preLaunch(size_t inVectorSize, int inType, size_t outVectorSize,
                                int outType) {
    // Choose the scale factors for the float coefficient caches from the
    // input/output type combination.
    if (inType == outType) {
        if (outType == RS_TYPE_UNSIGNED_8) {
            updateCoeffCache(1.f, 255.f);
        } else {
            updateCoeffCache(1.f, 1.f);
        }
    } else {
        if (outType == RS_TYPE_UNSIGNED_8) {
            updateCoeffCache(255.f, 255.f);
        } else {
            updateCoeffCache(1.f / 255.f, 1.f);
        }
    }

    Key_t key = computeKey(inVectorSize, inType, outVectorSize, outType);
#else
/**
 * Prepares the task for execution: refreshes the coefficient caches, computes
 * the configuration key, and (re)builds the optimized kernel state when the
 * key changed or no optimized kernel is set.
 */
void ColorMatrixTask::preLaunch(size_t inVectorSize, size_t outVectorSize) {
    // uchar in, uchar out.
    updateCoeffCache(1.f, 255.f);

    Key_t key = computeKey(inVectorSize, outVectorSize);
#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT

#if defined(ARCH_X86_HAVE_SSSE3)
    // On SSSE3 builds the optimized kernel is currently disabled, so only the
    // key is recorded and mOptKernel stays nullptr.
    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
        // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
        // mOptKernel =
        //     (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key);
        mLastKey = key;
    }

#else //if !defined(ARCH_X86_HAVE_SSSE3)
    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
        // Discard any previously generated code before building anew.
        if (mBuf) munmap(mBuf, mBufSize);
        mBuf = nullptr;
        mOptKernel = nullptr;
        if (build(key)) {
            // The generated code starts at the beginning of the buffer.
            mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf;
        }
#if defined(ARCH_ARM64_USE_INTRINSICS)
        else {
            // Destination / source type codes for the setup routines: the
            // vector size code, plus 4 when the element type is float.
            int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
            int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
            // Repack the masks into 5 bits per output column i: the low four
            // bits are coeffMask bits i, i+4, i+8, i+12 and the fifth is
            // addMask bit i.
            uint32_t mm = 0;
            int i;
            for (i = 0; i < 4; i++)
            {
                uint32_t m = (key.u.coeffMask >> i) & 0x1111;
                // Multiplying by 0x249 and shifting right by 9 gathers the
                // bits at positions 0, 4, 8, 12 into positions 0..3.
                m = ((m * 0x249) >> 9) & 15;
                m |= ((key.u.addMask >> i) & 1) << 4;
                mm |= m << (i * 5);
            }

            if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
                rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
            } else {
                rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
            }
        }
#endif
        mLastKey = key;
    }
#endif //if !defined(ARCH_X86_HAVE_SSSE3)
}
1025
1026 void ColorMatrixTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
1027 size_t endY) {
1028 for (size_t y = startY; y < endY; y++) {
1029 size_t offset = mSizeX * y + startX;
1030 uchar* in = ((uchar*)mIn) + offset * paddedSize(mInputVectorSize);
1031 uchar* out = ((uchar*)mOut) + offset * paddedSize(mVectorSize);
1032 kernel(out, in, startX, endX);
1033 }
1034 }
1035
// Add vector used by colorMatrix() when the caller passes none.
static constexpr float fourZeroes[4] = {0.0f, 0.0f, 0.0f, 0.0f};
1037
1038 void RenderScriptToolkit::colorMatrix(const void* in, void* out, size_t inputVectorSize,
1039 size_t outputVectorSize, size_t sizeX, size_t sizeY,
1040 const float* matrix, const float* addVector,
1041 const Restriction* restriction) {
1042 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
1043 if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
1044 return;
1045 }
1046 if (inputVectorSize < 1 || inputVectorSize > 4) {
1047 ALOGE("The inputVectorSize should be between 1 and 4. %zu provided.", inputVectorSize);
1048 return;
1049 }
1050 if (outputVectorSize < 1 || outputVectorSize > 4) {
1051 ALOGE("The outputVectorSize should be between 1 and 4. %zu provided.", outputVectorSize);
1052 return;
1053 }
1054 #endif
1055
1056 if (addVector == nullptr) {
1057 addVector = fourZeroes;
1058 }
1059 ColorMatrixTask task(in, out, inputVectorSize, outputVectorSize, sizeX, sizeY, matrix,
1060 addVector, restriction);
1061 processor->doTask(&task);
1062 }
1063
1064 } // namespace renderscript
1065