• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
#include "RenderScriptToolkit.h"
#include "TaskProcessor.h"
#include "Utils.h"
#include <cassert>
#include <cerrno>
#include <cstdint>
#include <cstring>
#include <sys/mman.h>
23 
24 namespace renderscript {
25 
26 #define LOG_TAG "renderscript.toolkit.ColorMatrix"
27 
28 /*  uint kernel
29  *  Q0  D0:  Load slot for R
30  *      D1:  Load slot for G
31  *  Q1  D2:  Load slot for B
32  *      D3:  Load slot for A
33  *  Q2  D4:  Matrix
34  *      D5:  =
35  *  Q3  D6:  =
36  *      D7:  =
37  *  Q4  D8:  Add R
38  *      D9:
39  *  Q5  D10: Add G
40  *      D11:
41  *  Q6  D12: Add B
42  *      D13:
43  *  Q7  D14: Add A
44  *      D15:
45  *  Q8  D16:  I32: R Sum
46  *      D17:
47  *  Q9  D18:  I32: G Sum
48  *      D19:
49  *  Q10 D20:  I32: B Sum
50  *      D21:
51  *  Q11 D22:  I32: A Sum
52  *      D23:
53  *  Q12 D24:  U16: expanded R
54  *      D25:
55  *  Q13 D26:  U16: expanded G
56  *      D27:
57  *  Q14 D28:  U16: expanded B
58  *      D29:
59  *  Q15 D30:  U16: expanded A
60  *      D31:
61  *
62  */
63 
64 /*  float kernel
65  *  Q0  D0:  Load slot for R
66  *      D1:  =
67  *  Q1  D2:  Load slot for G
68  *      D3:  =
69  *  Q2  D4:  Load slot for B
70  *      D5:  =
71  *  Q3  D6:  Load slot for A
72  *      D7:  =
73  *  Q4  D8:  Matrix
74  *      D9:  =
75  *  Q5  D10: =
76  *      D11: =
77  *  Q6  D12: =
78  *      D13: =
79  *  Q7  D14: =
80  *      D15: =
81  *  Q8  D16: Add R
82  *      D17: =
83  *  Q9  D18: Add G
84  *      D19: =
85  *  Q10 D20: Add B
86  *      D21: =
87  *  Q11 D22: Add A
88  *      D23: =
89  *  Q12 D24: Sum R
90  *      D25: =
91  *  Q13 D26: Sum G
92  *      D27: =
93  *  Q14 D28: Sum B
94  *      D29: =
95  *  Q15 D30: Sum A
96  *      D31: =
97  *
98  */
99 
/* Packed description of one color-matrix configuration.
 *
 * The same 64 bits can be read as a whole through `key` (used for equality
 * comparisons between configurations) or field by field through `u`.
 */
union Key_t {
    uint64_t key;
    struct {
        uint32_t inVecSize          :2;  // bits 0-1
        uint32_t outVecSize         :2;  // bits 2-3
        uint32_t inType             :4;  // bits 4-7
        uint32_t outType            :4;  // bits 8-11
        uint32_t dot                :1;  // bit 12
        uint32_t _unused1           :1;  // bit 13
        uint32_t copyAlpha          :1;  // bit 14
        uint32_t _unused2           :1;  // bit 15
        uint32_t coeffMask          :16; // bits 16-31, one bit per matrix coefficient
        uint32_t addMask            :4;  // bits 32-35, one bit per add-vector component
    } u;
};
115 
/* Element type identifiers, using the numeric values given in the RenderScript
 * documentation. Only RS_TYPE_UNSIGNED_8 is currently supported.
 *
 * TODO: The actual values of these constants are likely not important. We may be
 * able to simplify the key related code.
 */
constexpr int RS_TYPE_UNSIGNED_8 = 8;
constexpr int RS_TYPE_FLOAT_32 = 2;
124 
// Re-enable when intrinsic is fixed
#if defined(ARCH_ARM64_USE_INTRINSICS)
/* Table of code fragments handed to the ARM64 assembly kernels below. It is
 * filled in by the rsdIntrinsicColorMatrixSetup_*_K functions.
 */
struct FunctionTab_t {
    void (*column[4])();
    void (*store)();
    void (*load)();
    void (*store_end)();
    void (*load_end)();
};

extern "C" {

// Integer kernel: processes `count` items from `in` to `out` using the function
// table plus the fixed-point multiplier and add tables.
void rsdIntrinsicColorMatrix_int_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             int16_t const *mult, int32_t const *add);

// Float kernel: same argument shape as the integer kernel, with float tables.
void rsdIntrinsicColorMatrix_float_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             float const *mult, float const *add);

/* The setup functions fill in function tables to be used by above functions;
 * this code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions.  While it's not performance critical, it works out easier
 * to write the set-up code in assembly than to try to expose the same symbols
 * and write the code in C.
 */
void rsdIntrinsicColorMatrixSetup_int_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

void rsdIntrinsicColorMatrixSetup_float_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

}  // extern "C"
#endif //  ARCH_ARM64_USE_INTRINSICS
159 
160 class ColorMatrixTask : public Task {
161     const void* mIn;
162     void* mOut;
163     size_t mInputVectorSize;
164     uint32_t mOutstep;
165     uint32_t mInstep;
166 
167     float mFp[16];
168     float mFpa[4];
169 
170     // The following four fields are read as constants
171     // by the SIMD assembly code.
172     int16_t mIp[16];
173     int mIpa[4];
174     float mTmpFp[16];
175     float mTmpFpa[4];
176 #if defined(ARCH_ARM64_USE_INTRINSICS)
177     FunctionTab_t mFnTab;
178 #endif
179 
180     void kernel(uchar* out, uchar* in, uint32_t xstart, uint32_t xend);
181     void updateCoeffCache(float fpMul, float addMul);
182 
183     Key_t mLastKey;
184     unsigned char* mBuf;
185     size_t mBufSize;
186 
187     bool build(Key_t key);
188     void (*mOptKernel)(void* dst, const void* src, const int16_t* coef, uint32_t count);
189 
190 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
191     Key_t computeKey(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
192     void preLaunch(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
193 #else
194     Key_t computeKey(size_t inVectorSize, size_t outVectorSize);
195     void preLaunch(size_t inVectorSize, size_t outVectorSize);
196 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
197 
198     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
199     void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
200                      size_t endY) override;
201 
202    public:
ColorMatrixTask(const void * in,void * out,size_t inputVectorSize,size_t outputVectorSize,size_t sizeX,size_t sizeY,const float * matrix,const float * addVector,const Restriction * restriction)203     ColorMatrixTask(const void* in, void* out, size_t inputVectorSize, size_t outputVectorSize,
204                     size_t sizeX, size_t sizeY, const float* matrix, const float* addVector,
205                     const Restriction* restriction)
206         : Task{sizeX, sizeY, outputVectorSize, true, restriction},
207           mIn{in},
208           mOut{out},
209           mInputVectorSize{inputVectorSize} {
210         mLastKey.key = 0;
211         mBuf = nullptr;
212         mBufSize = 0;
213         mOptKernel = nullptr;
214 
215         mOutstep = paddedSize(outputVectorSize);
216         mInstep = paddedSize(inputVectorSize);
217 
218         memcpy(mFp, matrix, sizeof(mFp));
219         memcpy(mFpa, addVector, sizeof(mFpa));
220 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
221         // For float support, we'll have to pass the type in the constructor too.
222         preLaunch(inputVectorSize, RS_TYPE_UNSIGNED_8, outputVectorSize, RS_TYPE_UNSIGNED_8);
223 #else
224         preLaunch(inputVectorSize, outputVectorSize);
225 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
226     }
~ColorMatrixTask()227     ~ColorMatrixTask() {
228         if (mBuf) munmap(mBuf, mBufSize);
229         mBuf = nullptr;
230         mOptKernel = nullptr;
231     }
232 };
233 
234 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
computeKey(size_t inVectorSize,int inType,size_t outVectorSize,int outType)235 Key_t ColorMatrixTask::computeKey(size_t inVectorSize, int inType, size_t outVectorSize,
236                                   int outType) {
237     Key_t key;
238     key.key = 0;
239 
240     // Compute a unique code key for this operation
241 
242     // Add to the key the input and output types
243     bool hasFloat = false;
244     if (inType == RS_TYPE_FLOAT_32) {
245         hasFloat = true;
246         key.u.inType = RS_TYPE_FLOAT_32;
247     }
248     if (outType == RS_TYPE_FLOAT_32) {
249         hasFloat = true;
250         key.u.outType = RS_TYPE_FLOAT_32;
251     }
252 
253     // Mask in the bits indicating which coefficients in the
254     // color matrix are needed.
255     if (hasFloat) {
256         for (uint32_t i=0; i < 16; i++) {
257             if (fabs(mFp[i]) != 0.f) {
258                 key.u.coeffMask |= 1 << i;
259             }
260         }
261         if (fabs(mFpa[0]) != 0.f) key.u.addMask |= 0x1;
262         if (fabs(mFpa[1]) != 0.f) key.u.addMask |= 0x2;
263         if (fabs(mFpa[2]) != 0.f) key.u.addMask |= 0x4;
264         if (fabs(mFpa[3]) != 0.f) key.u.addMask |= 0x8;
265 
266     } else {
267 #else
268 Key_t ColorMatrixTask::computeKey(size_t inVectorSize, size_t outVectorSize) {
269     Key_t key;
270     key.key = 0;
271 
272     // Compute a unique code key for this operation
273     {
274 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
275 
276         for (uint32_t i=0; i < 16; i++) {
277             if (mIp[i] != 0) {
278                 key.u.coeffMask |= 1 << i;
279             }
280         }
281         if (mIpa[0] != 0) key.u.addMask |= 0x1;
282         if (mIpa[1] != 0) key.u.addMask |= 0x2;
283         if (mIpa[2] != 0) key.u.addMask |= 0x4;
284         if (mIpa[3] != 0) key.u.addMask |= 0x8;
285     }
286 
287     // Look for a dot product where the r,g,b colums are the same
288     if ((mIp[0] == mIp[1]) && (mIp[0] == mIp[2]) &&
289         (mIp[4] == mIp[5]) && (mIp[4] == mIp[6]) &&
290         (mIp[8] == mIp[9]) && (mIp[8] == mIp[10]) &&
291         (mIp[12] == mIp[13]) && (mIp[12] == mIp[14])) {
292 
293         if (!key.u.addMask) key.u.dot = 1;
294     }
295 
296     // Is alpha a simple copy
297     if (!(key.u.coeffMask & 0x0888) && (mIp[15] == 256) && !(key.u.addMask & 0x8)) {
298         key.u.copyAlpha = !(key.u.inType || key.u.outType);
299     }
300 
301     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
302 
303     switch (inVectorSize) {
304     case 4:
305         key.u.inVecSize = 3;
306         break;
307     case 3:
308         key.u.inVecSize = 2;
309         key.u.coeffMask &= ~0xF000;
310         break;
311     case 2:
312         key.u.inVecSize = 1;
313         key.u.coeffMask &= ~0xFF00;
314         break;
315     default:
316         key.u.coeffMask &= ~0xFFF0;
317         break;
318     }
319 
320     switch (outVectorSize) {
321     case 4:
322         key.u.outVecSize = 3;
323         break;
324     case 3:
325         key.u.outVecSize = 2;
326         key.u.coeffMask &= ~0x8888;
327         key.u.addMask &= 7;
328         break;
329     case 2:
330         key.u.outVecSize = 1;
331         key.u.coeffMask &= ~0xCCCC;
332         key.u.addMask &= 3;
333         break;
334     default:
335         key.u.coeffMask &= ~0xEEEE;
336         key.u.addMask &= 1;
337         break;
338     }
339 
340     if (key.u.inType && !key.u.outType) {
341         key.u.addMask |= 1;
342         if (key.u.outVecSize > 0) key.u.addMask |= 2;
343         if (key.u.outVecSize > 1) key.u.addMask |= 4;
344         if (key.u.outVecSize > 2) key.u.addMask |= 8;
345     }
346 
347     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
348     return key;
349 }
350 
351 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
352 
// Declares the three external symbols that describe one code fragment:
// the fragment itself, its end marker, and a word holding its length in bytes
// (the length word is what ADD_CHUNK passes to memcpy).
#define DEF_SYM(name)                                  \
    extern "C" uint32_t _N_ColorMatrix_##name;         \
    extern "C" uint32_t _N_ColorMatrix_##name##_end;   \
    extern "C" uint32_t _N_ColorMatrix_##name##_len;

// Function prologue (integer / float) and the two epilogue pieces.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads of u8 input, by vector size; the u8f variants are used on the float path.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)

// Loads of f32 input, by vector size.
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores of u8 output.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)

// Stores of f32 output; the f32u variants are used when the output is not float.
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Unpack / pack steps of the integer path, the dot-product step, and the add steps.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
402 
// Appends the code fragment named `x` to the output cursor `buf` (a byte pointer
// in the caller's scope) and advances the cursor past it.
// Wrapped in do/while(0) so the macro behaves as one statement, including in an
// unbraced if/else.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
406 
407 
// Writes one 32-bit branch instruction at `buf` that jumps to `target`, with the
// given 4-bit condition code in the top nibble, and returns the position just
// past it. The 24-bit offset field is the distance from (buf + 8), in 4-byte words.
static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
    const size_t wordOffset = (target - buf - 8) >> 2;

    // The offset must fit the signed 24-bit field: the byte above it is either
    // all zeros or all ones.
    assert(((wordOffset & 0xff000000) == 0) ||
           ((wordOffset & 0xff000000) == 0xff000000));

    const uint32_t instruction = static_cast<uint32_t>(
            (condition << 28) | (0xa << 24) | (0xffffff & wordOffset));
    *reinterpret_cast<uint32_t *>(buf) = instruction;
    return buf + 4;
}
419 
// Packs three 5-bit register numbers into their instruction fields. For each
// register the low four bits and the fifth bit go to separate positions:
//   vd -> bits 12-15 and bit 22
//   vn -> bits 16-19 and bit 7
//   vm -> bits 0-3   and bit 5
static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
    assert(vd < 32);
    assert(vm < 32);
    assert(vn < 32);

    const uint32_t dField = ((vd & 0xf) << 12) | (((vd >> 4) & 0x1) << 22);
    const uint32_t nField = ((vn & 0xf) << 16) | (((vn >> 4) & 0x1) << 7);
    const uint32_t mField = (vm & 0xf) | (((vm >> 4) & 0x1) << 5);
    return dField | mField | nField;
}
430 
431 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
432                               uint32_t src_d2_s) {
433     //vmlal.s16 Q#1, D#1, D#2[#]
434     uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
435     ((uint32_t *)buf)[0] = op;
436     return buf + 4;
437 }
438 
439 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
440                               uint32_t src_d2_s) {
441     //vmull.s16 Q#1, D#1, D#2[#]
442     uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
443     ((uint32_t *)buf)[0] = op;
444     return buf + 4;
445 }
446 
447 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
448     //vqadd.s32 Q#1, Q#1, Q#2
449     uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
450     ((uint32_t *)buf)[0] = op;
451     return buf + 4;
452 }
453 
454 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
455                               uint32_t src_d2_s) {
456     //vmlal.f32 Q#1, D#1, D#2[#]
457     uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
458     ((uint32_t *)buf)[0] = op;
459     return buf + 4;
460 }
461 
462 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
463                               uint32_t src_d2_s) {
464     //vmull.f32 Q#1, D#1, D#2[#]
465     uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
466     ((uint32_t *)buf)[0] = op;
467     return buf + 4;
468 }
469 
470 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
471     //vadd.f32 Q#1, D#1, D#2
472     uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
473     ((uint32_t *)buf)[0] = op;
474     return buf + 4;
475 }
476 
477 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
478     //vmov.32 Q#1, #imm
479     assert(imm == 0);
480     (void) imm; // Avoid unused parameter warnings for non-debug builds
481     uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
482     ((uint32_t *)buf)[0] = op;
483     return buf + 4;
484 }
485 
486 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
487     //vadd.f32 Q#1, D#1, D#2
488     uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
489     ((uint32_t *)buf)[0] = op;
490     return buf + 4;
491 }
492 #endif
493 
494 #if defined(ARCH_X86_HAVE_SSSE3)
495 extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
496                                   const int16_t *coef, uint32_t count);
497 extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
498                                   const int16_t *coef, uint32_t count);
499 extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
500                                   const int16_t *coef, uint32_t count);
501 
502 void * selectKernel(Key_t key)
503 {
504     void * kernel = nullptr;
505 
506     // inType, outType float if nonzero
507     if (!(key.u.inType || key.u.outType)) {
508         if (key.u.dot)
509             kernel = (void *)rsdIntrinsicColorMatrixDot_K;
510         else if (key.u.copyAlpha)
511             kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
512         else
513             kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
514     }
515 
516     return kernel;
517 }
518 #endif
519 
520 bool ColorMatrixTask::build(Key_t key) {
521 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
522     mBufSize = 4096;
523     //StopWatch build_time("rs cm: build time");
524     mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
525                                   MAP_PRIVATE | MAP_ANON, -1, 0);
526     if (mBuf == MAP_FAILED) {
527         mBuf = NULL;
528         return false;
529     }
530 
531     uint8_t *buf = mBuf;
532     uint8_t *buf2 = nullptr;
533 
534     int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
535     int opInit[4] = {0, 0, 0, 0};
536 
537     memset(ops, 0, sizeof(ops));
538     for (int i=0; i < 4; i++) {
539         if (key.u.coeffMask & (1 << (i*4))) {
540             ops[i][0] = 0x2 | opInit[0];
541             opInit[0] = 1;
542         }
543         if (!key.u.dot) {
544             if (key.u.coeffMask & (1 << (1 + i*4))) {
545                 ops[i][1] = 0x2 | opInit[1];
546                 opInit[1] = 1;
547             }
548             if (key.u.coeffMask & (1 << (2 + i*4))) {
549                 ops[i][2] = 0x2 | opInit[2];
550                 opInit[2] = 1;
551             }
552         }
553         if (!key.u.copyAlpha) {
554             if (key.u.coeffMask & (1 << (3 + i*4))) {
555                 ops[i][3] = 0x2 | opInit[3];
556                 opInit[3] = 1;
557             }
558         }
559     }
560 
561     if (key.u.inType || key.u.outType) {
562         key.u.copyAlpha = 0;
563         ADD_CHUNK(prefix_f);
564         buf2 = buf;
565 
566         // Load the incoming r,g,b,a as needed
567         if (key.u.inType) {
568             switch(key.u.inVecSize) {
569             case 3:
570                 ADD_CHUNK(load_f32_4);
571                 break;
572             case 2:
573                 ADD_CHUNK(load_f32_3);
574                 break;
575             case 1:
576                 ADD_CHUNK(load_f32_2);
577                 break;
578             case 0:
579                 ADD_CHUNK(load_f32_1);
580                 break;
581             }
582         } else {
583             switch(key.u.inVecSize) {
584             case 3:
585                 ADD_CHUNK(load_u8f_4);
586                 break;
587             case 2:
588                 ADD_CHUNK(load_u8f_3);
589                 break;
590             case 1:
591                 ADD_CHUNK(load_u8f_2);
592                 break;
593             case 0:
594                 ADD_CHUNK(load_u8f_1);
595                 break;
596             }
597         }
598 
599         for (int i=0; i < 4; i++) {
600             for (int j=0; j < 4; j++) {
601                 switch(ops[i][j]) {
602                 case 0:
603                     break;
604                 case 2:
605                     buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
606                     break;
607                 case 3:
608                     buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
609                     break;
610                 }
611             }
612         }
613         for (int j=0; j < 4; j++) {
614             if (opInit[j]) {
615                 if (key.u.addMask & (1 << j)) {
616                     buf = addVADD_F32(buf, j, 12+j, 8+j);
617                 } else {
618                     buf = addVORR_32(buf, j, 12+j, 12+j);
619                 }
620             } else {
621                 if (key.u.addMask & (1 << j)) {
622                     buf = addVORR_32(buf, j, 8+j, 8+j);
623                 } else {
624                     buf = addVMOV_32(buf, j, 0);
625                 }
626             }
627         }
628 
629         if (key.u.outType) {
630             switch(key.u.outVecSize) {
631             case 3:
632                 ADD_CHUNK(store_f32_4);
633                 break;
634             case 2:
635                 ADD_CHUNK(store_f32_3);
636                 break;
637             case 1:
638                 ADD_CHUNK(store_f32_2);
639                 break;
640             case 0:
641                 ADD_CHUNK(store_f32_1);
642                 break;
643             }
644         } else {
645             switch(key.u.outVecSize) {
646             case 3:
647             case 2:
648                 ADD_CHUNK(store_f32u_4);
649                 break;
650             case 1:
651                 ADD_CHUNK(store_f32u_2);
652                 break;
653             case 0:
654                 ADD_CHUNK(store_f32u_1);
655                 break;
656             }
657         }
658 
659 
660     } else {
661         // Add the function prefix
662         // Store the address for the loop return
663         ADD_CHUNK(prefix_i);
664         buf2 = buf;
665 
666         // Load the incoming r,g,b,a as needed
667         switch(key.u.inVecSize) {
668         case 3:
669             ADD_CHUNK(load_u8_4);
670             if (key.u.copyAlpha) {
671                 ADD_CHUNK(unpack_u8_3);
672             } else {
673                 ADD_CHUNK(unpack_u8_4);
674             }
675             break;
676         case 2:
677             ADD_CHUNK(load_u8_3);
678             ADD_CHUNK(unpack_u8_3);
679             break;
680         case 1:
681             ADD_CHUNK(load_u8_2);
682             ADD_CHUNK(unpack_u8_2);
683             break;
684         case 0:
685             ADD_CHUNK(load_u8_1);
686             ADD_CHUNK(unpack_u8_1);
687             break;
688         }
689 
690         // Add multiply and accumulate
691         // use MULL to init the output register,
692         // use MLAL from there
693         for (int i=0; i < 4; i++) {
694             for (int j=0; j < 4; j++) {
695                 switch(ops[i][j]) {
696                 case 0:
697                     break;
698                 case 2:
699                     buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
700                     break;
701                 case 3:
702                     buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
703                     break;
704                 }
705             }
706         }
707         for (int j=0; j < 4; j++) {
708             if (opInit[j]) {
709                 if (key.u.addMask & (1 << j)) {
710                     buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
711                 }
712             } else {
713                 if (key.u.addMask & (1 << j)) {
714                     buf = addVORR_32(buf, 8+j, 4+j, 4+j);
715                 }
716             }
717         }
718 
719         // If we have a dot product, perform the special pack.
720         if (key.u.dot) {
721             ADD_CHUNK(pack_u8_1);
722             ADD_CHUNK(dot);
723         } else {
724             switch(key.u.outVecSize) {
725             case 3:
726                 if (key.u.copyAlpha) {
727                     ADD_CHUNK(pack_u8_3);
728                 } else {
729                     ADD_CHUNK(pack_u8_4);
730                 }
731                 break;
732             case 2:
733                 ADD_CHUNK(pack_u8_3);
734                 break;
735             case 1:
736                 ADD_CHUNK(pack_u8_2);
737                 break;
738             case 0:
739                 ADD_CHUNK(pack_u8_1);
740                 break;
741             }
742         }
743 
744         // Write out result
745         switch(key.u.outVecSize) {
746         case 3:
747         case 2:
748             ADD_CHUNK(store_u8_4);
749             break;
750         case 1:
751             ADD_CHUNK(store_u8_2);
752             break;
753         case 0:
754             ADD_CHUNK(store_u8_1);
755             break;
756         }
757     }
758 
759     if (key.u.inType != key.u.outType) {
760         key.u.copyAlpha = 0;
761         key.u.dot = 0;
762     }
763 
764     // Loop, branch, and cleanup
765     ADD_CHUNK(postfix1);
766     buf = addBranch(buf, buf2, 0x01);
767     ADD_CHUNK(postfix2);
768 
769     int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
770     if (ret == -1) {
771         ALOGE("mprotect error %i", ret);
772         return false;
773     }
774 
775     __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
776     return true;
777 #else
778     (void) key; // Avoid unused parameter warning.
779     return false;
780 #endif
781 }
782 
783 void ColorMatrixTask::updateCoeffCache(float fpMul, float addMul) {
784     for(int ct=0; ct < 16; ct++) {
785         mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
786         mTmpFp[ct] = mFp[ct] * fpMul;
787         //ALOGE("mat %i %f  %f", ct, mFp[ct], tmpFp[ct]);
788     }
789 
790     float add = 0.f;
791     if (fpMul > 254.f) add = 0.5f;
792     for(int ct=0; ct < 4; ct++) {
793         mTmpFpa[ct] = mFpa[ct] * addMul + add;
794         //ALOGE("mFpa %i %f  %f", ct, mFpa[ct], tmpFpa[ct * 4 + 0]);
795     }
796 
797     for(int ct=0; ct < 4; ct++) {
798         mIpa[ct] = (int)(mFpa[ct] * 65536.f + 0.5f);
799     }
800 }
801 
802 
803 
804 static void One(void *out,
805                 const void *py, const float* coeff, const float *add,
806                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
807 
808     float4 f = 0.f;
809     if (fin) {
810         switch(vsin) {
811         case 3:
812             f = ((const float4 *)py)[0];
813             break;
814         case 2:
815             f = ((const float4 *)py)[0];
816             f.w = 0.f;
817             break;
818         case 1:
819             f.xy = ((const float2 *)py)[0];
820             break;
821         case 0:
822             f.x = ((const float *)py)[0];
823             break;
824         }
825     } else {
826         switch(vsin) {
827         case 3:
828             f = convert<float4>(((const uchar4 *)py)[0]);
829             break;
830         case 2:
831             f = convert<float4>(((const uchar4 *)py)[0]);
832             f.w = 0.f;
833             break;
834         case 1:
835             f.xy = convert<float2>(((const uchar2 *)py)[0]);
836             break;
837         case 0:
838             f.x = (float)(((const uchar *)py)[0]);
839             break;
840         }
841     }
842     //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
843 
844     float4 sum;
845     sum.x = f.x * coeff[0] +
846             f.y * coeff[4] +
847             f.z * coeff[8] +
848             f.w * coeff[12];
849     sum.y = f.x * coeff[1] +
850             f.y * coeff[5] +
851             f.z * coeff[9] +
852             f.w * coeff[13];
853     sum.z = f.x * coeff[2] +
854             f.y * coeff[6] +
855             f.z * coeff[10] +
856             f.w * coeff[14];
857     sum.w = f.x * coeff[3] +
858             f.y * coeff[7] +
859             f.z * coeff[11] +
860             f.w * coeff[15];
861     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
862 
863     sum.x += add[0];
864     sum.y += add[1];
865     sum.z += add[2];
866     sum.w += add[3];
867 
868 
869     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
870     if (fout) {
871         switch(vsout) {
872         case 3:
873         case 2:
874             ((float4 *)out)[0] = sum;
875             break;
876         case 1:
877             ((float2 *)out)[0] = sum.xy;
878             break;
879         case 0:
880             ((float *)out)[0] = sum.x;
881             break;
882         }
883     } else {
884         sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
885         sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
886         sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
887         sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
888 
889         switch(vsout) {
890         case 3:
891         case 2:
892             ((uchar4 *)out)[0] = convert<uchar4>(sum);
893             break;
894         case 1:
895             ((uchar2 *)out)[0] = convert<uchar2>(sum.xy);
896             break;
897         case 0:
898             ((uchar *)out)[0] = sum.x;
899             break;
900         }
901     }
902     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2],
903     //      ((float *)out)[3]);
904 }
905 
906 void ColorMatrixTask::kernel(uchar *out, uchar *in, uint32_t xstart, uint32_t xend) {
907     uint32_t x1 = xstart;
908     uint32_t x2 = xend;
909 
910     uint32_t vsin = mLastKey.u.inVecSize;
911     uint32_t vsout = mLastKey.u.outVecSize;
912     bool floatIn = !!mLastKey.u.inType;
913     bool floatOut = !!mLastKey.u.outType;
914 
915     //if (!info->current.y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
916 
917     if(x2 > x1) {
918         int32_t len = x2 - x1;
919         if (mUsesSimd) {
920             if((mOptKernel != nullptr) && (len >= 4)) {
921                 // The optimized kernel processes 4 pixels at once
922                 // and requires a minimum of 1 chunk of 4
923                 mOptKernel(out, in, mIp, len >> 2);
924                 // Update the len and pointers so the generic code can
925                 // finish any leftover pixels
926                 len &= ~3;
927                 x1 += len;
928                 out += mOutstep * len;
929                 in += mInstep * len;
930             }
931 #if defined(ARCH_ARM64_USE_INTRINSICS)
932             else {
933                 if (mLastKey.u.inType == RS_TYPE_FLOAT_32 ||
934                     mLastKey.u.outType == RS_TYPE_FLOAT_32) {
935                     // Currently this generates off by one errors.
936                     // rsdIntrinsicColorMatrix_float_K(out, in, len, &mFnTab, tmpFp, tmpFpa);
937                     // x1 += len;
938                     // out += outstep * len;
939                     // in += instep * len;
940                 } else {
941                     rsdIntrinsicColorMatrix_int_K(out, in, len, &mFnTab, mIp, mIpa);
942                     x1 += len;
943                     out += mOutstep * len;
944                     in += mInstep * len;
945                 }
946             }
947 #endif
948         }
949 
950         while(x1 != x2) {
951             One(out, in, mTmpFp, mTmpFpa, vsin, vsout, floatIn, floatOut);
952             out += mOutstep;
953             in += mInstep;
954             x1++;
955         }
956     }
957 }
958 
959 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
960 void ColorMatrixTask::preLaunch(size_t inVectorSize, int inType, size_t outVectorSize,
961                                 int outType) {
962     if (inType == outType) {
963         if (outType == RS_TYPE_UNSIGNED_8) {
964             updateCoeffCache(1.f, 255.f);
965         } else {
966             updateCoeffCache(1.f, 1.f);
967         }
968     } else {
969         if (outType == RS_TYPE_UNSIGNED_8) {
970             updateCoeffCache(255.f, 255.f);
971         } else {
972             updateCoeffCache(1.f / 255.f, 1.f);
973         }
974     }
975 
976     Key_t key = computeKey(inVectorSize, inType, outVectorSize, outType);
977 #else
978 void ColorMatrixTask::preLaunch(size_t inVectorSize, size_t outVectorSize) {
979     updateCoeffCache(1.f, 255.f);
980 
981     Key_t key = computeKey(inVectorSize, outVectorSize);
982 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
983 
984 #if defined(ARCH_X86_HAVE_SSSE3)
985     if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
986         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
987         // mOptKernel =
988         //     (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key);
989         mLastKey = key;
990     }
991 
992 #else //if !defined(ARCH_X86_HAVE_SSSE3)
993     if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
994         if (mBuf) munmap(mBuf, mBufSize);
995         mBuf = nullptr;
996         mOptKernel = nullptr;
997         if (build(key)) {
998             mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf;
999         }
1000 #if defined(ARCH_ARM64_USE_INTRINSICS)
1001         else {
1002             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
1003             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
1004             uint32_t mm = 0;
1005             int i;
1006             for (i = 0; i < 4; i++)
1007             {
1008                 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
1009                 m = ((m * 0x249) >> 9) & 15;
1010                 m |= ((key.u.addMask >> i) & 1) << 4;
1011                 mm |= m << (i * 5);
1012             }
1013 
1014             if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
1015                 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
1016             } else {
1017                 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
1018             }
1019         }
1020 #endif
1021         mLastKey = key;
1022     }
1023 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
1024 }
1025 
1026 void ColorMatrixTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
1027                                   size_t endY) {
1028     for (size_t y = startY; y < endY; y++) {
1029         size_t offset = mSizeX * y + startX;
1030         uchar* in = ((uchar*)mIn) + offset * paddedSize(mInputVectorSize);
1031         uchar* out = ((uchar*)mOut) + offset * paddedSize(mVectorSize);
1032         kernel(out, in, startX, endX);
1033     }
1034 }
1035 
// All-zero four-element vector, used when the caller passes no add vector.
static const float fourZeroes[4] = {0.0f, 0.0f, 0.0f, 0.0f};
1037 
1038 void RenderScriptToolkit::colorMatrix(const void* in, void* out, size_t inputVectorSize,
1039                                       size_t outputVectorSize, size_t sizeX, size_t sizeY,
1040                                       const float* matrix, const float* addVector,
1041                                       const Restriction* restriction) {
1042 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
1043     if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
1044         return;
1045     }
1046     if (inputVectorSize < 1 || inputVectorSize > 4) {
1047         ALOGE("The inputVectorSize should be between 1 and 4. %zu provided.", inputVectorSize);
1048         return;
1049     }
1050     if (outputVectorSize < 1 || outputVectorSize > 4) {
1051         ALOGE("The outputVectorSize should be between 1 and 4. %zu provided.", outputVectorSize);
1052         return;
1053     }
1054 #endif
1055 
1056     if (addVector == nullptr) {
1057         addVector = fourZeroes;
1058     }
1059     ColorMatrixTask task(in, out, inputVectorSize, outputVectorSize, sizeX, sizeY, matrix,
1060                          addVector, restriction);
1061     processor->doTask(&task);
1062 }
1063 
1064 }  // namespace renderscript
1065