1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "RenderScriptToolkit.h"
18 #include "TaskProcessor.h"
19 #include "Utils.h"
20 #include <assert.h>
21 #include <cstdint>
22 #include <sys/mman.h>
23
24 namespace android {
25 namespace renderscript {
26
27 #define LOG_TAG "renderscript.toolkit.ColorMatrix"
28
29 /* uint kernel
30 * Q0 D0: Load slot for R
31 * D1: Load slot for G
32 * Q1 D2: Load slot for B
33 * D3: Load slot for A
34 * Q2 D4: Matrix
35 * D5: =
36 * Q3 D6: =
37 * D7: =
38 * Q4 D8: Add R
39 * D9:
40 * Q5 D10: Add G
41 * D11:
42 * Q6 D12: Add B
43 * D13:
44 * Q7 D14: Add A
45 * D15:
46 * Q8 D16: I32: R Sum
47 * D17:
48 * Q9 D18: I32: G Sum
49 * D19:
50 * Q10 D20: I32: B Sum
51 * D21:
52 * Q11 D22: I32: A Sum
53 * D23:
54 * Q12 D24: U16: expanded R
55 * D25:
56 * Q13 D26: U16: expanded G
57 * D27:
58 * Q14 D28: U16: expanded B
59 * D29:
60 * Q15 D30: U16: expanded A
61 * D31:
62 *
63 */
64
65 /* float kernel
66 * Q0 D0: Load slot for R
67 * D1: =
68 * Q1 D2: Load slot for G
69 * D3: =
70 * Q2 D4: Load slot for B
71 * D5: =
72 * Q3 D6: Load slot for A
73 * D7: =
74 * Q4 D8: Matrix
75 * D9: =
76 * Q5 D10: =
77 * D11: =
78 * Q6 D12: =
79 * D13: =
80 * Q7 D14: =
81 * D15: =
82 * Q8 D16: Add R
83 * D17: =
84 * Q9 D18: Add G
85 * D19: =
86 * Q10 D20: Add B
87 * D21: =
88 * Q11 D22: Add A
89 * D23: =
90 * Q12 D24: Sum R
91 * D25: =
92 * Q13 D26: Sum G
93 * D27: =
94 * Q14 D28: Sum B
95 * D29: =
96 * Q15 D30: Sum A
97 * D31: =
98 *
99 */
100
/* Compact description of one color-matrix operation.
 *
 * The bit-field view (u) and the 64-bit integer view (key) share storage, so two
 * operations can be compared by comparing their `key` values.
 */
union Key_t {
    uint64_t key;  // Whole-key view, used for comparison and zero-initialization.
    struct {
        uint32_t inVecSize   : 2;   // [0 - 1]  input vector size minus one
        uint32_t outVecSize  : 2;   // [2 - 3]  output vector size minus one
        uint32_t inType      : 4;   // [4 - 7]  non-zero (RS_TYPE_FLOAT_32) when the input is float
        uint32_t outType     : 4;   // [8 - 11] non-zero (RS_TYPE_FLOAT_32) when the output is float
        uint32_t dot         : 1;   // [12]     the r, g and b columns of the matrix are identical
        uint32_t _unused1    : 1;   // [13]
        uint32_t copyAlpha   : 1;   // [14]     alpha is passed through unchanged
        uint32_t _unused2    : 1;   // [15]
        uint32_t coeffMask   : 16;  // [16-31]  bit i is set when matrix coefficient i is needed
        uint32_t addMask     : 4;   // [32-35]  bit i is set when add-vector component i is needed
    } u;
};
116
/* Element type identifiers, using the values given in the RenderScript documentation.
 * Only RS_TYPE_UNSIGNED_8 is currently supported.
 *
 * TODO: The actual values of these constants are likely not important. We may be
 * able to simplify the key related code.
 */
constexpr int RS_TYPE_FLOAT_32 = 2;
constexpr int RS_TYPE_UNSIGNED_8 = 8;
125
//Re-enable when intrinsic is fixed
#if defined(ARCH_ARM64_USE_INTRINSICS)
/* Table of code entry points consumed by the ARM64 kernels declared below.
 * It is filled in by rsdIntrinsicColorMatrixSetup_{int,float}_K and then handed
 * to rsdIntrinsicColorMatrix_{int,float}_K on every call.
 * NOTE(review): the field order is presumably relied upon by the assembly
 * implementation; do not reorder without checking it.
 */
typedef struct {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
} FunctionTab_t;

/* Integer kernel: processes `count` cells from `in` to `out` using the function
 * table `fns`, the fixed-point multipliers `mult` and the fixed-point `add` vector.
 */
extern "C" void rsdIntrinsicColorMatrix_int_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             int16_t const *mult, int32_t const *add);

/* Float kernel: same parameters as the integer kernel, with float multipliers
 * and add vector.
 */
extern "C" void rsdIntrinsicColorMatrix_float_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             float const *mult, float const *add);

/* The setup functions fill in function tables to be used by above functions;
 * this code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions.  While it's not performance critical, it works out easier
 * to write the set-up code in assembly than to try to expose the same symbols
 * and write the code in C.
 *
 * `mask` packs, per output channel, four coefficient bits plus one add bit
 * (see ColorMatrixTask::preLaunch); `dt` and `st` encode the destination and
 * source vector size, plus 4 when the respective type is float.
 */
extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);
#endif
160
161 class ColorMatrixTask : public Task {
162 const void* mIn;
163 void* mOut;
164 size_t mInputVectorSize;
165 uint32_t mOutstep;
166 uint32_t mInstep;
167
168 float mFp[16];
169 float mFpa[4];
170
171 // The following four fields are read as constants
172 // by the SIMD assembly code.
173 int16_t mIp[16];
174 int mIpa[4];
175 float mTmpFp[16];
176 float mTmpFpa[4];
177 #if defined(ARCH_ARM64_USE_INTRINSICS)
178 FunctionTab_t mFnTab;
179 #endif
180
181 void kernel(uchar* out, uchar* in, uint32_t xstart, uint32_t xend);
182 void updateCoeffCache(float fpMul, float addMul);
183
184 Key_t mLastKey;
185 unsigned char* mBuf;
186 size_t mBufSize;
187
188 bool build(Key_t key);
189 void (*mOptKernel)(void* dst, const void* src, const int16_t* coef, uint32_t count);
190
191 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
192 Key_t computeKey(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
193 void preLaunch(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
194 #else
195 Key_t computeKey(size_t inVectorSize, size_t outVectorSize);
196 void preLaunch(size_t inVectorSize, size_t outVectorSize);
197 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
198
199 // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
200 virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
201 size_t endY) override;
202
203 public:
ColorMatrixTask(const void * in,void * out,size_t inputVectorSize,size_t outputVectorSize,size_t sizeX,size_t sizeY,const float * matrix,const float * addVector,const Restriction * restriction)204 ColorMatrixTask(const void* in, void* out, size_t inputVectorSize, size_t outputVectorSize,
205 size_t sizeX, size_t sizeY, const float* matrix, const float* addVector,
206 const Restriction* restriction)
207 : Task{sizeX, sizeY, outputVectorSize, true, restriction},
208 mIn{in},
209 mOut{out},
210 mInputVectorSize{inputVectorSize} {
211 mLastKey.key = 0;
212 mBuf = nullptr;
213 mBufSize = 0;
214 mOptKernel = nullptr;
215
216 mOutstep = paddedSize(outputVectorSize);
217 mInstep = paddedSize(inputVectorSize);
218
219 memcpy(mFp, matrix, sizeof(mFp));
220 memcpy(mFpa, addVector, sizeof(mFpa));
221 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
222 // For float support, we'll have to pass the type in the constructor too.
223 preLaunch(inputVectorSize, RS_TYPE_UNSIGNED_8, outputVectorSize, RS_TYPE_UNSIGNED_8);
224 #else
225 preLaunch(inputVectorSize, outputVectorSize);
226 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
227 }
~ColorMatrixTask()228 ~ColorMatrixTask() {
229 if (mBuf) munmap(mBuf, mBufSize);
230 mBuf = nullptr;
231 mOptKernel = nullptr;
232 }
233 };
234
235 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
computeKey(size_t inVectorSize,int inType,size_t outVectorSize,int outType)236 Key_t ColorMatrixTask::computeKey(size_t inVectorSize, int inType, size_t outVectorSize,
237 int outType) {
238 Key_t key;
239 key.key = 0;
240
241 // Compute a unique code key for this operation
242
243 // Add to the key the input and output types
244 bool hasFloat = false;
245 if (inType == RS_TYPE_FLOAT_32) {
246 hasFloat = true;
247 key.u.inType = RS_TYPE_FLOAT_32;
248 }
249 if (outType == RS_TYPE_FLOAT_32) {
250 hasFloat = true;
251 key.u.outType = RS_TYPE_FLOAT_32;
252 }
253
254 // Mask in the bits indicating which coefficients in the
255 // color matrix are needed.
256 if (hasFloat) {
257 for (uint32_t i=0; i < 16; i++) {
258 if (fabs(mFp[i]) != 0.f) {
259 key.u.coeffMask |= 1 << i;
260 }
261 }
262 if (fabs(mFpa[0]) != 0.f) key.u.addMask |= 0x1;
263 if (fabs(mFpa[1]) != 0.f) key.u.addMask |= 0x2;
264 if (fabs(mFpa[2]) != 0.f) key.u.addMask |= 0x4;
265 if (fabs(mFpa[3]) != 0.f) key.u.addMask |= 0x8;
266
267 } else {
268 #else
269 Key_t ColorMatrixTask::computeKey(size_t inVectorSize, size_t outVectorSize) {
270 Key_t key;
271 key.key = 0;
272
273 // Compute a unique code key for this operation
274 {
275 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
276
277 for (uint32_t i=0; i < 16; i++) {
278 if (mIp[i] != 0) {
279 key.u.coeffMask |= 1 << i;
280 }
281 }
282 if (mIpa[0] != 0) key.u.addMask |= 0x1;
283 if (mIpa[1] != 0) key.u.addMask |= 0x2;
284 if (mIpa[2] != 0) key.u.addMask |= 0x4;
285 if (mIpa[3] != 0) key.u.addMask |= 0x8;
286 }
287
288 // Look for a dot product where the r,g,b colums are the same
289 if ((mIp[0] == mIp[1]) && (mIp[0] == mIp[2]) &&
290 (mIp[4] == mIp[5]) && (mIp[4] == mIp[6]) &&
291 (mIp[8] == mIp[9]) && (mIp[8] == mIp[10]) &&
292 (mIp[12] == mIp[13]) && (mIp[12] == mIp[14])) {
293
294 if (!key.u.addMask) key.u.dot = 1;
295 }
296
297 // Is alpha a simple copy
298 if (!(key.u.coeffMask & 0x0888) && (mIp[15] == 256) && !(key.u.addMask & 0x8)) {
299 key.u.copyAlpha = !(key.u.inType || key.u.outType);
300 }
301
302 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
303
304 switch (inVectorSize) {
305 case 4:
306 key.u.inVecSize = 3;
307 break;
308 case 3:
309 key.u.inVecSize = 2;
310 key.u.coeffMask &= ~0xF000;
311 break;
312 case 2:
313 key.u.inVecSize = 1;
314 key.u.coeffMask &= ~0xFF00;
315 break;
316 default:
317 key.u.coeffMask &= ~0xFFF0;
318 break;
319 }
320
321 switch (outVectorSize) {
322 case 4:
323 key.u.outVecSize = 3;
324 break;
325 case 3:
326 key.u.outVecSize = 2;
327 key.u.coeffMask &= ~0x8888;
328 key.u.addMask &= 7;
329 break;
330 case 2:
331 key.u.outVecSize = 1;
332 key.u.coeffMask &= ~0xCCCC;
333 key.u.addMask &= 3;
334 break;
335 default:
336 key.u.coeffMask &= ~0xEEEE;
337 key.u.addMask &= 1;
338 break;
339 }
340
341 if (key.u.inType && !key.u.outType) {
342 key.u.addMask |= 1;
343 if (key.u.outVecSize > 0) key.u.addMask |= 2;
344 if (key.u.outVecSize > 1) key.u.addMask |= 4;
345 if (key.u.outVecSize > 2) key.u.addMask |= 8;
346 }
347
348 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
349 return key;
350 }
351
352 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
353
// Declares the three external symbols that describe one pre-assembled code chunk
// named x: its start (_N_ColorMatrix_x), its end (_N_ColorMatrix_x_end) and a word
// holding its length in bytes (_N_ColorMatrix_x_len). ADD_CHUNK copies `_len` bytes
// starting at the start symbol.
// NOTE(review): the chunks are presumably defined in a separate assembly file.
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologue (integer / float variants) and the two epilogue pieces that
// surround the loop branch.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads: u8 input for the integer path, u8 input for the float path (u8f),
// and float input (f32), for 4, 3, 2 and 1 components.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores: u8 output from the integer path, float output (f32), and u8 output from
// the float path (f32u). There is no 3-component u8 store; build() uses the
// 4-component one for 3-component output.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer path helpers: widen the loaded u8 data, narrow the sums back to u8,
// and the special handling for the dot-product case.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
// NOTE(review): the add_N_u8 chunks are declared but not referenced by build().
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
401
// Appends the pre-assembled chunk x to the code being generated: copies its bytes to
// the local cursor `buf` and advances the cursor past them.
// Wrapped in do { } while (0) so the macro expands to exactly one statement and stays
// correct when used as the body of an unbraced if/else. Always follow it with ';'.
#define ADD_CHUNK(x)                                                    \
    do {                                                                \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len);     \
        buf += _N_ColorMatrix_##x##_len;                                \
    } while (0)
405
406
/**
 * Emits a 32-bit ARM branch instruction at buf that jumps to target.
 *
 * @param buf       Where the instruction is written.
 * @param target    Branch destination.
 * @param condition ARM condition code, placed in the top four bits of the instruction.
 * @return Pointer just past the emitted instruction.
 */
static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
    // The offset is counted in words, relative to the instruction address plus 8.
    const size_t wordOffset = (target - buf - 8) >> 2;
    // Only 24 bits are encoded, so the offset must sign-extend from 24 bits.
    assert(((wordOffset & 0xff000000) == 0) ||
           ((wordOffset & 0xff000000) == 0xff000000));

    uint32_t instruction = condition << 28;
    instruction |= 0xa << 24;  // branch
    instruction |= static_cast<uint32_t>(0xffffff & wordOffset);

    *reinterpret_cast<uint32_t *>(buf) = instruction;
    return buf + 4;
}
418
/**
 * Packs three SIMD register numbers (each 0..31) into the register fields of an
 * instruction word. For every register the low four bits and the top bit go to
 * separate positions:
 *   vd: bits 12-15 and bit 22
 *   vn: bits 16-19 and bit 7
 *   vm: bits 0-3   and bit 5
 */
static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
    assert(vd < 32);
    assert(vm < 32);
    assert(vn < 32);

    const uint32_t vdBits = ((vd & 0xf) << 12) | (((vd >> 4) & 1) << 22);
    const uint32_t vnBits = ((vn & 0xf) << 16) | (((vn >> 4) & 1) << 7);
    const uint32_t vmBits = (vm & 0xf) | (((vm >> 4) & 1) << 5);
    return vdBits | vnBits | vmBits;
}
429
430 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
431 uint32_t src_d2_s) {
432 //vmlal.s16 Q#1, D#1, D#2[#]
433 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
434 ((uint32_t *)buf)[0] = op;
435 return buf + 4;
436 }
437
438 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
439 uint32_t src_d2_s) {
440 //vmull.s16 Q#1, D#1, D#2[#]
441 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
442 ((uint32_t *)buf)[0] = op;
443 return buf + 4;
444 }
445
446 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
447 //vqadd.s32 Q#1, Q#1, Q#2
448 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
449 ((uint32_t *)buf)[0] = op;
450 return buf + 4;
451 }
452
453 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
454 uint32_t src_d2_s) {
455 //vmlal.f32 Q#1, D#1, D#2[#]
456 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
457 ((uint32_t *)buf)[0] = op;
458 return buf + 4;
459 }
460
461 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
462 uint32_t src_d2_s) {
463 //vmull.f32 Q#1, D#1, D#2[#]
464 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
465 ((uint32_t *)buf)[0] = op;
466 return buf + 4;
467 }
468
469 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
470 //vadd.f32 Q#1, D#1, D#2
471 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
472 ((uint32_t *)buf)[0] = op;
473 return buf + 4;
474 }
475
476 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
477 //vmov.32 Q#1, #imm
478 assert(imm == 0);
479 (void) imm; // Avoid unused parameter warnings for non-debug builds
480 uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
481 ((uint32_t *)buf)[0] = op;
482 return buf + 4;
483 }
484
485 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
486 //vadd.f32 Q#1, D#1, D#2
487 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
488 ((uint32_t *)buf)[0] = op;
489 return buf + 4;
490 }
491 #endif
492
493 #if defined(ARCH_X86_HAVE_SSSE3)
494 extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
495 const int16_t *coef, uint32_t count);
496 extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
497 const int16_t *coef, uint32_t count);
498 extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
499 const int16_t *coef, uint32_t count);
500
501 using android::renderscript::Key_t;
502
503 void * selectKernel(Key_t key)
504 {
505 void * kernel = nullptr;
506
507 // inType, outType float if nonzero
508 if (!(key.u.inType || key.u.outType)) {
509 if (key.u.dot)
510 kernel = (void *)rsdIntrinsicColorMatrixDot_K;
511 else if (key.u.copyAlpha)
512 kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
513 else
514 kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
515 }
516
517 return kernel;
518 }
519 #endif
520
521 bool ColorMatrixTask::build(Key_t key) {
522 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
523 mBufSize = 4096;
524 //StopWatch build_time("rs cm: build time");
525 mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
526 MAP_PRIVATE | MAP_ANON, -1, 0);
527 if (mBuf == MAP_FAILED) {
528 mBuf = NULL;
529 return false;
530 }
531
532 uint8_t *buf = mBuf;
533 uint8_t *buf2 = nullptr;
534
535 int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final
536 int opInit[4] = {0, 0, 0, 0};
537
538 memset(ops, 0, sizeof(ops));
539 for (int i=0; i < 4; i++) {
540 if (key.u.coeffMask & (1 << (i*4))) {
541 ops[i][0] = 0x2 | opInit[0];
542 opInit[0] = 1;
543 }
544 if (!key.u.dot) {
545 if (key.u.coeffMask & (1 << (1 + i*4))) {
546 ops[i][1] = 0x2 | opInit[1];
547 opInit[1] = 1;
548 }
549 if (key.u.coeffMask & (1 << (2 + i*4))) {
550 ops[i][2] = 0x2 | opInit[2];
551 opInit[2] = 1;
552 }
553 }
554 if (!key.u.copyAlpha) {
555 if (key.u.coeffMask & (1 << (3 + i*4))) {
556 ops[i][3] = 0x2 | opInit[3];
557 opInit[3] = 1;
558 }
559 }
560 }
561
562 if (key.u.inType || key.u.outType) {
563 key.u.copyAlpha = 0;
564 ADD_CHUNK(prefix_f);
565 buf2 = buf;
566
567 // Load the incoming r,g,b,a as needed
568 if (key.u.inType) {
569 switch(key.u.inVecSize) {
570 case 3:
571 ADD_CHUNK(load_f32_4);
572 break;
573 case 2:
574 ADD_CHUNK(load_f32_3);
575 break;
576 case 1:
577 ADD_CHUNK(load_f32_2);
578 break;
579 case 0:
580 ADD_CHUNK(load_f32_1);
581 break;
582 }
583 } else {
584 switch(key.u.inVecSize) {
585 case 3:
586 ADD_CHUNK(load_u8f_4);
587 break;
588 case 2:
589 ADD_CHUNK(load_u8f_3);
590 break;
591 case 1:
592 ADD_CHUNK(load_u8f_2);
593 break;
594 case 0:
595 ADD_CHUNK(load_u8f_1);
596 break;
597 }
598 }
599
600 for (int i=0; i < 4; i++) {
601 for (int j=0; j < 4; j++) {
602 switch(ops[i][j]) {
603 case 0:
604 break;
605 case 2:
606 buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
607 break;
608 case 3:
609 buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
610 break;
611 }
612 }
613 }
614 for (int j=0; j < 4; j++) {
615 if (opInit[j]) {
616 if (key.u.addMask & (1 << j)) {
617 buf = addVADD_F32(buf, j, 12+j, 8+j);
618 } else {
619 buf = addVORR_32(buf, j, 12+j, 12+j);
620 }
621 } else {
622 if (key.u.addMask & (1 << j)) {
623 buf = addVORR_32(buf, j, 8+j, 8+j);
624 } else {
625 buf = addVMOV_32(buf, j, 0);
626 }
627 }
628 }
629
630 if (key.u.outType) {
631 switch(key.u.outVecSize) {
632 case 3:
633 ADD_CHUNK(store_f32_4);
634 break;
635 case 2:
636 ADD_CHUNK(store_f32_3);
637 break;
638 case 1:
639 ADD_CHUNK(store_f32_2);
640 break;
641 case 0:
642 ADD_CHUNK(store_f32_1);
643 break;
644 }
645 } else {
646 switch(key.u.outVecSize) {
647 case 3:
648 case 2:
649 ADD_CHUNK(store_f32u_4);
650 break;
651 case 1:
652 ADD_CHUNK(store_f32u_2);
653 break;
654 case 0:
655 ADD_CHUNK(store_f32u_1);
656 break;
657 }
658 }
659
660
661 } else {
662 // Add the function prefix
663 // Store the address for the loop return
664 ADD_CHUNK(prefix_i);
665 buf2 = buf;
666
667 // Load the incoming r,g,b,a as needed
668 switch(key.u.inVecSize) {
669 case 3:
670 ADD_CHUNK(load_u8_4);
671 if (key.u.copyAlpha) {
672 ADD_CHUNK(unpack_u8_3);
673 } else {
674 ADD_CHUNK(unpack_u8_4);
675 }
676 break;
677 case 2:
678 ADD_CHUNK(load_u8_3);
679 ADD_CHUNK(unpack_u8_3);
680 break;
681 case 1:
682 ADD_CHUNK(load_u8_2);
683 ADD_CHUNK(unpack_u8_2);
684 break;
685 case 0:
686 ADD_CHUNK(load_u8_1);
687 ADD_CHUNK(unpack_u8_1);
688 break;
689 }
690
691 // Add multiply and accumulate
692 // use MULL to init the output register,
693 // use MLAL from there
694 for (int i=0; i < 4; i++) {
695 for (int j=0; j < 4; j++) {
696 switch(ops[i][j]) {
697 case 0:
698 break;
699 case 2:
700 buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
701 break;
702 case 3:
703 buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
704 break;
705 }
706 }
707 }
708 for (int j=0; j < 4; j++) {
709 if (opInit[j]) {
710 if (key.u.addMask & (1 << j)) {
711 buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
712 }
713 } else {
714 if (key.u.addMask & (1 << j)) {
715 buf = addVORR_32(buf, 8+j, 4+j, 4+j);
716 }
717 }
718 }
719
720 // If we have a dot product, perform the special pack.
721 if (key.u.dot) {
722 ADD_CHUNK(pack_u8_1);
723 ADD_CHUNK(dot);
724 } else {
725 switch(key.u.outVecSize) {
726 case 3:
727 if (key.u.copyAlpha) {
728 ADD_CHUNK(pack_u8_3);
729 } else {
730 ADD_CHUNK(pack_u8_4);
731 }
732 break;
733 case 2:
734 ADD_CHUNK(pack_u8_3);
735 break;
736 case 1:
737 ADD_CHUNK(pack_u8_2);
738 break;
739 case 0:
740 ADD_CHUNK(pack_u8_1);
741 break;
742 }
743 }
744
745 // Write out result
746 switch(key.u.outVecSize) {
747 case 3:
748 case 2:
749 ADD_CHUNK(store_u8_4);
750 break;
751 case 1:
752 ADD_CHUNK(store_u8_2);
753 break;
754 case 0:
755 ADD_CHUNK(store_u8_1);
756 break;
757 }
758 }
759
760 if (key.u.inType != key.u.outType) {
761 key.u.copyAlpha = 0;
762 key.u.dot = 0;
763 }
764
765 // Loop, branch, and cleanup
766 ADD_CHUNK(postfix1);
767 buf = addBranch(buf, buf2, 0x01);
768 ADD_CHUNK(postfix2);
769
770 int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
771 if (ret == -1) {
772 ALOGE("mprotect error %i", ret);
773 return false;
774 }
775
776 __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
777 return true;
778 #else
779 (void) key; // Avoid unused parameter warning.
780 return false;
781 #endif
782 }
783
784 void ColorMatrixTask::updateCoeffCache(float fpMul, float addMul) {
785 for(int ct=0; ct < 16; ct++) {
786 mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
787 mTmpFp[ct] = mFp[ct] * fpMul;
788 //ALOGE("mat %i %f %f", ct, mFp[ct], tmpFp[ct]);
789 }
790
791 float add = 0.f;
792 if (fpMul > 254.f) add = 0.5f;
793 for(int ct=0; ct < 4; ct++) {
794 mTmpFpa[ct] = mFpa[ct] * addMul + add;
795 //ALOGE("mFpa %i %f %f", ct, mFpa[ct], tmpFpa[ct * 4 + 0]);
796 }
797
798 for(int ct=0; ct < 4; ct++) {
799 mIpa[ct] = (int)(mFpa[ct] * 65536.f + 0.5f);
800 }
801 }
802
803
804
805 static void One(void *out,
806 const void *py, const float* coeff, const float *add,
807 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
808
809 float4 f = 0.f;
810 if (fin) {
811 switch(vsin) {
812 case 3:
813 f = ((const float4 *)py)[0];
814 break;
815 case 2:
816 f = ((const float4 *)py)[0];
817 f.w = 0.f;
818 break;
819 case 1:
820 f.xy = ((const float2 *)py)[0];
821 break;
822 case 0:
823 f.x = ((const float *)py)[0];
824 break;
825 }
826 } else {
827 switch(vsin) {
828 case 3:
829 f = convert<float4>(((const uchar4 *)py)[0]);
830 break;
831 case 2:
832 f = convert<float4>(((const uchar4 *)py)[0]);
833 f.w = 0.f;
834 break;
835 case 1:
836 f.xy = convert<float2>(((const uchar2 *)py)[0]);
837 break;
838 case 0:
839 f.x = (float)(((const uchar *)py)[0]);
840 break;
841 }
842 }
843 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w);
844
845 float4 sum;
846 sum.x = f.x * coeff[0] +
847 f.y * coeff[4] +
848 f.z * coeff[8] +
849 f.w * coeff[12];
850 sum.y = f.x * coeff[1] +
851 f.y * coeff[5] +
852 f.z * coeff[9] +
853 f.w * coeff[13];
854 sum.z = f.x * coeff[2] +
855 f.y * coeff[6] +
856 f.z * coeff[10] +
857 f.w * coeff[14];
858 sum.w = f.x * coeff[3] +
859 f.y * coeff[7] +
860 f.z * coeff[11] +
861 f.w * coeff[15];
862 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
863
864 sum.x += add[0];
865 sum.y += add[1];
866 sum.z += add[2];
867 sum.w += add[3];
868
869
870 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
871 if (fout) {
872 switch(vsout) {
873 case 3:
874 case 2:
875 ((float4 *)out)[0] = sum;
876 break;
877 case 1:
878 ((float2 *)out)[0] = sum.xy;
879 break;
880 case 0:
881 ((float *)out)[0] = sum.x;
882 break;
883 }
884 } else {
885 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
886 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
887 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
888 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
889
890 switch(vsout) {
891 case 3:
892 case 2:
893 ((uchar4 *)out)[0] = convert<uchar4>(sum);
894 break;
895 case 1:
896 ((uchar2 *)out)[0] = convert<uchar2>(sum.xy);
897 break;
898 case 0:
899 ((uchar *)out)[0] = sum.x;
900 break;
901 }
902 }
903 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2],
904 // ((float *)out)[3]);
905 }
906
907 void ColorMatrixTask::kernel(uchar *out, uchar *in, uint32_t xstart, uint32_t xend) {
908 uint32_t x1 = xstart;
909 uint32_t x2 = xend;
910
911 uint32_t vsin = mLastKey.u.inVecSize;
912 uint32_t vsout = mLastKey.u.outVecSize;
913 bool floatIn = !!mLastKey.u.inType;
914 bool floatOut = !!mLastKey.u.outType;
915
916 //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
917
918 if(x2 > x1) {
919 int32_t len = x2 - x1;
920 if (mUsesSimd) {
921 if((mOptKernel != nullptr) && (len >= 4)) {
922 // The optimized kernel processes 4 pixels at once
923 // and requires a minimum of 1 chunk of 4
924 mOptKernel(out, in, mIp, len >> 2);
925 // Update the len and pointers so the generic code can
926 // finish any leftover pixels
927 len &= ~3;
928 x1 += len;
929 out += mOutstep * len;
930 in += mInstep * len;
931 }
932 #if defined(ARCH_ARM64_USE_INTRINSICS)
933 else {
934 if (mLastKey.u.inType == RS_TYPE_FLOAT_32 ||
935 mLastKey.u.outType == RS_TYPE_FLOAT_32) {
936 // Currently this generates off by one errors.
937 // rsdIntrinsicColorMatrix_float_K(out, in, len, &mFnTab, tmpFp, tmpFpa);
938 // x1 += len;
939 // out += outstep * len;
940 // in += instep * len;
941 } else {
942 rsdIntrinsicColorMatrix_int_K(out, in, len, &mFnTab, mIp, mIpa);
943 x1 += len;
944 out += mOutstep * len;
945 in += mInstep * len;
946 }
947 }
948 #endif
949 }
950
951 while(x1 != x2) {
952 One(out, in, mTmpFp, mTmpFpa, vsin, vsout, floatIn, floatOut);
953 out += mOutstep;
954 in += mInstep;
955 x1++;
956 }
957 }
958 }
959
960 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
961 void ColorMatrixTask::preLaunch(size_t inVectorSize, int inType, size_t outVectorSize,
962 int outType) {
963 if (inType == outType) {
964 if (outType == RS_TYPE_UNSIGNED_8) {
965 updateCoeffCache(1.f, 255.f);
966 } else {
967 updateCoeffCache(1.f, 1.f);
968 }
969 } else {
970 if (outType == RS_TYPE_UNSIGNED_8) {
971 updateCoeffCache(255.f, 255.f);
972 } else {
973 updateCoeffCache(1.f / 255.f, 1.f);
974 }
975 }
976
977 Key_t key = computeKey(inVectorSize, inType, outVectorSize, outType);
978 #else
979 void ColorMatrixTask::preLaunch(size_t inVectorSize, size_t outVectorSize) {
980 updateCoeffCache(1.f, 255.f);
981
982 Key_t key = computeKey(inVectorSize, outVectorSize);
983 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
984
985 #if defined(ARCH_X86_HAVE_SSSE3)
986 if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
987 // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
988 // mOptKernel =
989 // (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key);
990 mLastKey = key;
991 }
992
993 #else //if !defined(ARCH_X86_HAVE_SSSE3)
994 if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
995 if (mBuf) munmap(mBuf, mBufSize);
996 mBuf = nullptr;
997 mOptKernel = nullptr;
998 if (build(key)) {
999 mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf;
1000 }
1001 #if defined(ARCH_ARM64_USE_INTRINSICS)
1002 else {
1003 int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
1004 int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
1005 uint32_t mm = 0;
1006 int i;
1007 for (i = 0; i < 4; i++)
1008 {
1009 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
1010 m = ((m * 0x249) >> 9) & 15;
1011 m |= ((key.u.addMask >> i) & 1) << 4;
1012 mm |= m << (i * 5);
1013 }
1014
1015 if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
1016 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
1017 } else {
1018 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
1019 }
1020 }
1021 #endif
1022 mLastKey = key;
1023 }
1024 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
1025 }
1026
1027 void ColorMatrixTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
1028 size_t endY) {
1029 for (size_t y = startY; y < endY; y++) {
1030 size_t offset = mSizeX * y + startX;
1031 uchar* in = ((uchar*)mIn) + offset * paddedSize(mInputVectorSize);
1032 uchar* out = ((uchar*)mOut) + offset * paddedSize(mVectorSize);
1033 kernel(out, in, startX, endX);
1034 }
1035 }
1036
1037 static const float fourZeroes[]{0.0f, 0.0f, 0.0f, 0.0f};
1038
1039 void RenderScriptToolkit::colorMatrix(const void* in, void* out, size_t inputVectorSize,
1040 size_t outputVectorSize, size_t sizeX, size_t sizeY,
1041 const float* matrix, const float* addVector,
1042 const Restriction* restriction) {
1043 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
1044 if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
1045 return;
1046 }
1047 if (inputVectorSize < 1 || inputVectorSize > 4) {
1048 ALOGE("The inputVectorSize should be between 1 and 4. %zu provided.", inputVectorSize);
1049 return;
1050 }
1051 if (outputVectorSize < 1 || outputVectorSize > 4) {
1052 ALOGE("The outputVectorSize should be between 1 and 4. %zu provided.", outputVectorSize);
1053 return;
1054 }
1055 #endif
1056
1057 if (addVector == nullptr) {
1058 addVector = fourZeroes;
1059 }
1060 ColorMatrixTask task(in, out, inputVectorSize, outputVectorSize, sizeX, sizeY, matrix,
1061 addVector, restriction);
1062 processor->doTask(&task);
1063 }
1064
1065 } // namespace renderscript
1066 } // namespace android
1067