1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "RenderScriptToolkit.h"
18 #include "TaskProcessor.h"
19 #include "Utils.h"
20 #include <assert.h>
21 #include <cstdint>
22 #include <sys/mman.h>
23 
24 namespace android {
25 namespace renderscript {
26 
27 #define LOG_TAG "renderscript.toolkit.ColorMatrix"
28 
29 /*  uint kernel
30  *  Q0  D0:  Load slot for R
31  *      D1:  Load slot for G
32  *  Q1  D2:  Load slot for B
33  *      D3:  Load slot for A
34  *  Q2  D4:  Matrix
35  *      D5:  =
36  *  Q3  D6:  =
37  *      D7:  =
38  *  Q4  D8:  Add R
39  *      D9:
40  *  Q5  D10: Add G
41  *      D11:
42  *  Q6  D12: Add B
43  *      D13:
44  *  Q7  D14: Add A
45  *      D15:
46  *  Q8  D16:  I32: R Sum
47  *      D17:
48  *  Q9  D18:  I32: G Sum
49  *      D19:
50  *  Q10 D20:  I32: B Sum
51  *      D21:
52  *  Q11 D22:  I32: A Sum
53  *      D23:
54  *  Q12 D24:  U16: expanded R
55  *      D25:
56  *  Q13 D26:  U16: expanded G
57  *      D27:
58  *  Q14 D28:  U16: expanded B
59  *      D29:
60  *  Q15 D30:  U16: expanded A
61  *      D31:
62  *
63  */
64 
65 /*  float kernel
66  *  Q0  D0:  Load slot for R
67  *      D1:  =
68  *  Q1  D2:  Load slot for G
69  *      D3:  =
70  *  Q2  D4:  Load slot for B
71  *      D5:  =
72  *  Q3  D6:  Load slot for A
73  *      D7:  =
74  *  Q4  D8:  Matrix
75  *      D9:  =
76  *  Q5  D10: =
77  *      D11: =
78  *  Q6  D12: =
79  *      D13: =
80  *  Q7  D14: =
81  *      D15: =
82  *  Q8  D16: Add R
83  *      D17: =
84  *  Q9  D18: Add G
85  *      D19: =
86  *  Q10 D20: Add B
87  *      D21: =
88  *  Q11 D22: Add A
89  *      D23: =
90  *  Q12 D24: Sum R
91  *      D25: =
92  *  Q13 D26: Sum G
93  *      D27: =
94  *  Q14 D28: Sum B
95  *      D29: =
96  *  Q15 D30: Sum A
97  *      D31: =
98  *
99  */
100 
/* Identifies one specialization of the color matrix kernel. The same storage
 * is viewed either as a single integer (key), which is used to zero the whole
 * key and to compare two keys, or as the individual bit fields in u.
 */
union Key_t {
    uint64_t key;
    struct {
        uint32_t inVecSize : 2;   // bits 0-1
        uint32_t outVecSize : 2;  // bits 2-3
        uint32_t inType : 4;      // bits 4-7
        uint32_t outType : 4;     // bits 8-11
        uint32_t dot : 1;         // bit 12
        uint32_t _unused1 : 1;    // bit 13
        uint32_t copyAlpha : 1;   // bit 14
        uint32_t _unused2 : 1;    // bit 15
        uint32_t coeffMask : 16;  // bits 16-31, one bit per matrix coefficient
        uint32_t addMask : 4;     // bits 32-35, one bit per add-vector entry
    } u;
};
116 
/* The two element data types and their values, as specified in the
 * RenderScript documentation. Only RS_TYPE_UNSIGNED_8 is currently supported.
 *
 * TODO: The actual values of these constants are likely not important. We may
 * be able to simplify the key-related code.
 */
constexpr int RS_TYPE_UNSIGNED_8 = 8;
constexpr int RS_TYPE_FLOAT_32 = 2;
125 
126 //Re-enable when intrinsic is fixed
127 #if defined(ARCH_ARM64_USE_INTRINSICS)
/* Table of routine entry points handed to the ARM64 color matrix kernels.
 * It is filled in by the rsdIntrinsicColorMatrixSetup_*_K routines and then
 * passed to rsdIntrinsicColorMatrix_*_K. Those routines are implemented
 * outside this file, so the member order and types are part of that contract.
 * NOTE(review): the per-member notes below are inferred from the member
 * names only — confirm against the assembly source.
 */
typedef struct {
    void (*column[4])(void);  // presumably one routine per matrix column
    void (*store)(void);      // presumably writes results inside the main loop
    void (*load)(void);       // presumably reads inputs inside the main loop
    void (*store_end)(void);  // presumably the store variant for the tail
    void (*load_end)(void);   // presumably the load variant for the tail
} FunctionTab_t;
135 
/* Integer ARM64 kernel, implemented outside this file.
 * ColorMatrixTask::kernel() calls it with the output and input cell pointers,
 * the number of cells to process, the prepared function table, the 16
 * fixed-point matrix coefficients (mIp) and the 4 fixed-point add values (mIpa).
 */
extern "C" void rsdIntrinsicColorMatrix_int_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             int16_t const *mult, int32_t const *add);

/* Float counterpart of the kernel above. Its only call site in
 * ColorMatrixTask::kernel() is commented out because it "generates off by one
 * errors", so the declaration is currently unused by the visible code.
 */
extern "C" void rsdIntrinsicColorMatrix_float_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             float const *mult, float const *add);

/* The setup functions fill in function tables to be used by above functions;
 * this code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions.  While it's not performance critical, it works out easier
 * to write the set-up code in assembly than to try to expose the same symbols
 * and write the code in C.
 *
 * NOTE(review): mask, dt and st presumably describe the needed coefficients
 * and the destination/source element layouts — confirm against the caller in
 * preLaunch() and the assembly source.
 */
extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);
159 #endif
160 
161 class ColorMatrixTask : public Task {
162     const void* mIn;
163     void* mOut;
164     size_t mInputVectorSize;
165     uint32_t mOutstep;
166     uint32_t mInstep;
167 
168     float mFp[16];
169     float mFpa[4];
170 
171     // The following four fields are read as constants
172     // by the SIMD assembly code.
173     int16_t mIp[16];
174     int mIpa[4];
175     float mTmpFp[16];
176     float mTmpFpa[4];
177 #if defined(ARCH_ARM64_USE_INTRINSICS)
178     FunctionTab_t mFnTab;
179 #endif
180 
181     void kernel(uchar* out, uchar* in, uint32_t xstart, uint32_t xend);
182     void updateCoeffCache(float fpMul, float addMul);
183 
184     Key_t mLastKey;
185     unsigned char* mBuf;
186     size_t mBufSize;
187 
188     bool build(Key_t key);
189     void (*mOptKernel)(void* dst, const void* src, const int16_t* coef, uint32_t count);
190 
191 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
192     Key_t computeKey(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
193     void preLaunch(size_t inVectorSize, int inType, size_t outVectorSize, int outType);
194 #else
195     Key_t computeKey(size_t inVectorSize, size_t outVectorSize);
196     void preLaunch(size_t inVectorSize, size_t outVectorSize);
197 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
198 
199     // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
200     virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
201                              size_t endY) override;
202 
203    public:
ColorMatrixTask(const void * in,void * out,size_t inputVectorSize,size_t outputVectorSize,size_t sizeX,size_t sizeY,const float * matrix,const float * addVector,const Restriction * restriction)204     ColorMatrixTask(const void* in, void* out, size_t inputVectorSize, size_t outputVectorSize,
205                     size_t sizeX, size_t sizeY, const float* matrix, const float* addVector,
206                     const Restriction* restriction)
207         : Task{sizeX, sizeY, outputVectorSize, true, restriction},
208           mIn{in},
209           mOut{out},
210           mInputVectorSize{inputVectorSize} {
211         mLastKey.key = 0;
212         mBuf = nullptr;
213         mBufSize = 0;
214         mOptKernel = nullptr;
215 
216         mOutstep = paddedSize(outputVectorSize);
217         mInstep = paddedSize(inputVectorSize);
218 
219         memcpy(mFp, matrix, sizeof(mFp));
220         memcpy(mFpa, addVector, sizeof(mFpa));
221 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
222         // For float support, we'll have to pass the type in the constructor too.
223         preLaunch(inputVectorSize, RS_TYPE_UNSIGNED_8, outputVectorSize, RS_TYPE_UNSIGNED_8);
224 #else
225         preLaunch(inputVectorSize, outputVectorSize);
226 #endif  // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
227     }
~ColorMatrixTask()228     ~ColorMatrixTask() {
229         if (mBuf) munmap(mBuf, mBufSize);
230         mBuf = nullptr;
231         mOptKernel = nullptr;
232     }
233 };
234 
/* Builds the Key_t describing the kernel variant needed for the current
 * coefficients (mFp/mFpa and their fixed-point copies mIp/mIpa) and the given
 * vector sizes and element types.
 *
 * The function has two preprocessor-selected headers that share one body:
 * with float support it takes the element types and chooses between the
 * float and the fixed-point coefficient tests; without it only the
 * fixed-point tests are compiled.
 *
 * Returns the key; no member state is modified.
 */
#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
Key_t ColorMatrixTask::computeKey(size_t inVectorSize, int inType, size_t outVectorSize,
                                  int outType) {
    Key_t key;
    key.key = 0;

    // Compute a unique code key for this operation

    // Add to the key the input and output types
    // (only float is recorded; the type fields stay 0 otherwise).
    bool hasFloat = false;
    if (inType == RS_TYPE_FLOAT_32) {
        hasFloat = true;
        key.u.inType = RS_TYPE_FLOAT_32;
    }
    if (outType == RS_TYPE_FLOAT_32) {
        hasFloat = true;
        key.u.outType = RS_TYPE_FLOAT_32;
    }

    // Mask in the bits indicating which coefficients in the
    // color matrix are needed.
    if (hasFloat) {
        // Float path: a coefficient is needed when its float value is nonzero.
        for (uint32_t i=0; i < 16; i++) {
            if (fabs(mFp[i]) != 0.f) {
                key.u.coeffMask |= 1 << i;
            }
        }
        if (fabs(mFpa[0]) != 0.f) key.u.addMask |= 0x1;
        if (fabs(mFpa[1]) != 0.f) key.u.addMask |= 0x2;
        if (fabs(mFpa[2]) != 0.f) key.u.addMask |= 0x4;
        if (fabs(mFpa[3]) != 0.f) key.u.addMask |= 0x8;

    } else {
#else
Key_t ColorMatrixTask::computeKey(size_t inVectorSize, size_t outVectorSize) {
    Key_t key;
    key.key = 0;

    // Compute a unique code key for this operation
    {
#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT

        // Integer path: a coefficient is needed when its fixed-point value is
        // nonzero, so values that round to 0 in mIp/mIpa are dropped.
        for (uint32_t i=0; i < 16; i++) {
            if (mIp[i] != 0) {
                key.u.coeffMask |= 1 << i;
            }
        }
        if (mIpa[0] != 0) key.u.addMask |= 0x1;
        if (mIpa[1] != 0) key.u.addMask |= 0x2;
        if (mIpa[2] != 0) key.u.addMask |= 0x4;
        if (mIpa[3] != 0) key.u.addMask |= 0x8;
    }

    // Look for a dot product where the r,g,b columns are the same.
    // Coefficients 4*i+0..2 are input i's contributions to the first three
    // outputs; when they match for every input the three outputs are equal.
    if ((mIp[0] == mIp[1]) && (mIp[0] == mIp[2]) &&
        (mIp[4] == mIp[5]) && (mIp[4] == mIp[6]) &&
        (mIp[8] == mIp[9]) && (mIp[8] == mIp[10]) &&
        (mIp[12] == mIp[13]) && (mIp[12] == mIp[14])) {

        // The dot variant is only used when nothing has to be added.
        if (!key.u.addMask) key.u.dot = 1;
    }

    // Is alpha a simple copy
    // i.e. no contribution from the first three inputs to the fourth output
    // (mask bits 3, 7, 11), a fourth-to-fourth coefficient of exactly 256
    // (1.0 in the 8.8 fixed-point form built by updateCoeffCache) and no add
    // on the fourth output. Only set when both types are non-float.
    if (!(key.u.coeffMask & 0x0888) && (mIp[15] == 256) && !(key.u.addMask & 0x8)) {
        key.u.copyAlpha = !(key.u.inType || key.u.outType);
    }

    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);

    // Encode the input vector size as (size - 1) and drop the coefficients
    // that multiply input components which do not exist.
    switch (inVectorSize) {
    case 4:
        key.u.inVecSize = 3;
        break;
    case 3:
        key.u.inVecSize = 2;
        key.u.coeffMask &= ~0xF000;
        break;
    case 2:
        key.u.inVecSize = 1;
        key.u.coeffMask &= ~0xFF00;
        break;
    default:
        // inVecSize stays 0 (single component).
        key.u.coeffMask &= ~0xFFF0;
        break;
    }

    // Same for the output: drop coefficients and add entries that feed
    // output components which do not exist.
    switch (outVectorSize) {
    case 4:
        key.u.outVecSize = 3;
        break;
    case 3:
        key.u.outVecSize = 2;
        key.u.coeffMask &= ~0x8888;
        key.u.addMask &= 7;
        break;
    case 2:
        key.u.outVecSize = 1;
        key.u.coeffMask &= ~0xCCCC;
        key.u.addMask &= 3;
        break;
    default:
        // outVecSize stays 0 (single component).
        key.u.coeffMask &= ~0xEEEE;
        key.u.addMask &= 1;
        break;
    }

    // Float input with non-float output: force the add step for every output
    // component that exists.
    // NOTE(review): presumably because updateCoeffCache() folds a 0.5 rounding
    // term into the add vector for that conversion — confirm.
    if (key.u.inType && !key.u.outType) {
        key.u.addMask |= 1;
        if (key.u.outVecSize > 0) key.u.addMask |= 2;
        if (key.u.outVecSize > 1) key.u.addMask |= 4;
        if (key.u.outVecSize > 2) key.u.addMask |= 8;
    }

    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
    return key;
}
351 
352 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
353 
/* Declares the three external symbols that describe one pre-assembled code
 * fragment named x: the fragment itself, its _end marker and a _len word that
 * ADD_CHUNK reads as the number of bytes to copy. The symbols are defined
 * outside this file.
 */
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologue (integer / float flavour) and the two epilogue pieces
// placed around the loop branch.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Input loaders, by element type and component count.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Output stores. There is no 3-component variant for store_u8 and
// store_f32u; build() uses the 4-component fragment for those cases.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Fragments used only by the integer path of build().
// NOTE(review): the add_*_u8 fragments are declared but not referenced in the
// visible part of this file.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
401 
/* Appends the pre-assembled fragment x to the code being generated: copies
 * _N_ColorMatrix_<x>_len bytes starting at _N_ColorMatrix_<x> to the cursor
 * `buf` (a uint8_t* that must be in scope at the point of use) and advances
 * the cursor past them.
 *
 * Wrapped in do { } while (0) so the two statements always act as one, even
 * as the body of an unbraced if/else. Use it as a statement: ADD_CHUNK(x);
 */
#define ADD_CHUNK(x)                                                     \
    do {                                                                 \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len);      \
        buf += _N_ColorMatrix_##x##_len;                                 \
    } while (0)
405 
406 
/* Writes one 32-bit ARM branch instruction at buf that jumps to target.
 *
 * buf       - where the instruction is written; must be 4-byte aligned.
 * target    - branch destination.
 * condition - 4-bit condition code placed in the top nibble.
 *
 * The displacement is measured in words from buf + 8 and must fit in a
 * signed 24-bit field (checked by assert). Returns the address just past the
 * emitted instruction.
 */
static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
    const size_t wordOffset = (target - buf - 8) >> 2;

    // The bits above the 24-bit field must be a pure sign extension.
    const size_t signBits = wordOffset & 0xff000000;
    assert((signBits == 0) || (signBits == 0xff000000));

    // condition | branch opcode (0xa) | 24-bit displacement
    const uint32_t insn = (condition << 28) | (0xa << 24) | (0xffffff & wordOffset);
    *reinterpret_cast<uint32_t *>(buf) = insn;
    return buf + 4;
}
418 
/* Packs three 5-bit SIMD register numbers into the operand fields of a
 * 32-bit instruction word. Each number is split into a 4-bit field and a
 * single high bit:
 *   vd -> bits 12-15, high bit at 22
 *   vn -> bits 16-19, high bit at 7
 *   vm -> bits 0-3,   high bit at 5
 * Returns only the operand bits; the caller ORs in the opcode.
 */
static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
    assert(vd < 32);
    assert(vm < 32);
    assert(vn < 32);

    const uint32_t dBits = ((vd & 0xf) << 12) | (((vd >> 4) & 0x1) << 22);
    const uint32_t nBits = ((vn & 0xf) << 16) | (((vn >> 4) & 0x1) << 7);
    const uint32_t mBits = (vm & 0xf) | (((vm >> 4) & 0x1) << 5);
    return dBits | mBits | nBits;
}
429 
430 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
431                               uint32_t src_d2_s) {
432     //vmlal.s16 Q#1, D#1, D#2[#]
433     uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
434     ((uint32_t *)buf)[0] = op;
435     return buf + 4;
436 }
437 
438 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
439                               uint32_t src_d2_s) {
440     //vmull.s16 Q#1, D#1, D#2[#]
441     uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
442     ((uint32_t *)buf)[0] = op;
443     return buf + 4;
444 }
445 
446 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
447     //vqadd.s32 Q#1, Q#1, Q#2
448     uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
449     ((uint32_t *)buf)[0] = op;
450     return buf + 4;
451 }
452 
453 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
454                               uint32_t src_d2_s) {
455     //vmlal.f32 Q#1, D#1, D#2[#]
456     uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
457     ((uint32_t *)buf)[0] = op;
458     return buf + 4;
459 }
460 
461 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2,
462                               uint32_t src_d2_s) {
463     //vmull.f32 Q#1, D#1, D#2[#]
464     uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
465     ((uint32_t *)buf)[0] = op;
466     return buf + 4;
467 }
468 
469 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
470     //vadd.f32 Q#1, D#1, D#2
471     uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
472     ((uint32_t *)buf)[0] = op;
473     return buf + 4;
474 }
475 
476 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
477     //vmov.32 Q#1, #imm
478     assert(imm == 0);
479     (void) imm; // Avoid unused parameter warnings for non-debug builds
480     uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
481     ((uint32_t *)buf)[0] = op;
482     return buf + 4;
483 }
484 
485 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
486     //vadd.f32 Q#1, D#1, D#2
487     uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
488     ((uint32_t *)buf)[0] = op;
489     return buf + 4;
490 }
491 #endif
492 
493 #if defined(ARCH_X86_HAVE_SSSE3)
// Optimized x86 kernels, defined outside this file. selectKernel() picks one
// of them from the key: Dot for the dot-product case, 3x3 when alpha is a
// plain copy, 4x4 otherwise.
// NOTE(review): coef presumably receives the 16 fixed-point coefficients
// (mIp) and count the number of 4-pixel groups, matching how
// ColorMatrixTask::kernel() invokes mOptKernel — confirm.
extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const int16_t *coef, uint32_t count);
extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const int16_t *coef, uint32_t count);
extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const int16_t *coef, uint32_t count);

using android::renderscript::Key_t;
502 
503 void * selectKernel(Key_t key)
504 {
505     void * kernel = nullptr;
506 
507     // inType, outType float if nonzero
508     if (!(key.u.inType || key.u.outType)) {
509         if (key.u.dot)
510             kernel = (void *)rsdIntrinsicColorMatrixDot_K;
511         else if (key.u.copyAlpha)
512             kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
513         else
514             kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
515     }
516 
517     return kernel;
518 }
519 #endif
520 
/* Generates a specialized kernel for key on 32-bit ARM.
 *
 * Maps one 4096-byte page, fills it with pre-assembled fragments (ADD_CHUNK)
 * and individually encoded multiply/add instructions that cover only the
 * coefficients flagged in the key, then makes the page executable.
 *
 * Returns true when mBuf holds runnable code. Returns false when the mapping
 * or the protection change fails, and always on other architectures. After a
 * failed mprotect the mapping stays in mBuf and is released by the next
 * preLaunch() or by the destructor.
 *
 * NOTE(review): nothing checks that the emitted code stays within mBufSize.
 */
bool ColorMatrixTask::build(Key_t key) {
#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
    mBufSize = 4096;
    //StopWatch build_time("rs cm: build time");
    // Writable first; switched to executable once the code is complete.
    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANON, -1, 0);
    if (mBuf == MAP_FAILED) {
        mBuf = NULL;
        return false;
    }

    uint8_t *buf = mBuf;      // write cursor
    uint8_t *buf2 = nullptr;  // start of the loop body, target of the back branch

    // ops[i][j] describes the product of input i with output j:
    // 0 = not needed, 2 = first term for output j (multiply),
    // 3 = later term for output j (multiply-accumulate).
    int ops[5][4];
    // opInit[j] becomes 1 once output j has received its first term.
    int opInit[4] = {0, 0, 0, 0};

    memset(ops, 0, sizeof(ops));
    for (int i=0; i < 4; i++) {
        if (key.u.coeffMask & (1 << (i*4))) {
            ops[i][0] = 0x2 | opInit[0];
            opInit[0] = 1;
        }
        // For a dot product only output 0 is computed.
        if (!key.u.dot) {
            if (key.u.coeffMask & (1 << (1 + i*4))) {
                ops[i][1] = 0x2 | opInit[1];
                opInit[1] = 1;
            }
            if (key.u.coeffMask & (1 << (2 + i*4))) {
                ops[i][2] = 0x2 | opInit[2];
                opInit[2] = 1;
            }
        }
        // When alpha is copied through, output 3 needs no arithmetic.
        if (!key.u.copyAlpha) {
            if (key.u.coeffMask & (1 << (3 + i*4))) {
                ops[i][3] = 0x2 | opInit[3];
                opInit[3] = 1;
            }
        }
    }

    if (key.u.inType || key.u.outType) {
        // Float path: at least one side uses float data.
        key.u.copyAlpha = 0;
        ADD_CHUNK(prefix_f);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        if (key.u.inType) {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_f32_4);
                break;
            case 2:
                ADD_CHUNK(load_f32_3);
                break;
            case 1:
                ADD_CHUNK(load_f32_2);
                break;
            case 0:
                ADD_CHUNK(load_f32_1);
                break;
            }
        } else {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_u8f_4);
                break;
            case 2:
                ADD_CHUNK(load_u8f_3);
                break;
            case 1:
                ADD_CHUNK(load_u8f_2);
                break;
            case 0:
                ADD_CHUNK(load_u8f_1);
                break;
            }
        }

        // Accumulate into Q12+j from input register Q(i), passed as the
        // doubled number i*2, and the matrix lane held in D(8 + i*2 + j/2)[j%2].
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                case 3:
                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                }
            }
        }
        // Produce the final value of each output in Q(j): sum + add, sum
        // alone, add alone, or zero, depending on what the key requires.
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVADD_F32(buf, j, 12+j, 8+j);
                } else {
                    buf = addVORR_32(buf, j, 12+j, 12+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, j, 8+j, 8+j);
                } else {
                    buf = addVMOV_32(buf, j, 0);
                }
            }
        }

        // Store as float or convert back to bytes. The byte store has no
        // 3-component fragment, so that case uses the 4-component one.
        if (key.u.outType) {
            switch(key.u.outVecSize) {
            case 3:
                ADD_CHUNK(store_f32_4);
                break;
            case 2:
                ADD_CHUNK(store_f32_3);
                break;
            case 1:
                ADD_CHUNK(store_f32_2);
                break;
            case 0:
                ADD_CHUNK(store_f32_1);
                break;
            }
        } else {
            switch(key.u.outVecSize) {
            case 3:
            case 2:
                ADD_CHUNK(store_f32u_4);
                break;
            case 1:
                ADD_CHUNK(store_f32u_2);
                break;
            case 0:
                ADD_CHUNK(store_f32u_1);
                break;
            }
        }


    } else {
        // Integer path: byte input and byte output.
        // Add the function prefix
        // Store the address for the loop return
        ADD_CHUNK(prefix_i);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        switch(key.u.inVecSize) {
        case 3:
            ADD_CHUNK(load_u8_4);
            if (key.u.copyAlpha) {
                ADD_CHUNK(unpack_u8_3);
            } else {
                ADD_CHUNK(unpack_u8_4);
            }
            break;
        case 2:
            ADD_CHUNK(load_u8_3);
            ADD_CHUNK(unpack_u8_3);
            break;
        case 1:
            ADD_CHUNK(load_u8_2);
            ADD_CHUNK(unpack_u8_2);
            break;
        case 0:
            ADD_CHUNK(load_u8_1);
            ADD_CHUNK(unpack_u8_1);
            break;
        }

        // Add multiply and accumulate
        // use MULL to init the output register,
        // use MLAL from there
        // Sum j lives in Q(8+j); input i is read from D(24 + i*2) and the
        // coefficient from lane j of D(4+i).
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                case 3:
                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                }
            }
        }
        // Apply the add vector held in Q(4+j): add it to an existing sum or,
        // when the output has no product terms, copy it into the sum register.
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
                }
            }
        }

        // If we have a dot product, perform the special pack.
        if (key.u.dot) {
            ADD_CHUNK(pack_u8_1);
            ADD_CHUNK(dot);
        } else {
            switch(key.u.outVecSize) {
            case 3:
                if (key.u.copyAlpha) {
                    ADD_CHUNK(pack_u8_3);
                } else {
                    ADD_CHUNK(pack_u8_4);
                }
                break;
            case 2:
                ADD_CHUNK(pack_u8_3);
                break;
            case 1:
                ADD_CHUNK(pack_u8_2);
                break;
            case 0:
                ADD_CHUNK(pack_u8_1);
                break;
            }
        }

        // Write out result
        switch(key.u.outVecSize) {
        case 3:
        case 2:
            ADD_CHUNK(store_u8_4);
            break;
        case 1:
            ADD_CHUNK(store_u8_2);
            break;
        case 0:
            ADD_CHUNK(store_u8_1);
            break;
        }
    }

    // NOTE(review): key is a by-value parameter and is not read again below,
    // so these two assignments have no visible effect.
    if (key.u.inType != key.u.outType) {
        key.u.copyAlpha = 0;
        key.u.dot = 0;
    }

    // Loop, branch, and cleanup
    // Condition 0x01 is "not equal" in the ARM condition encoding; the
    // branch returns to the loop start recorded in buf2.
    ADD_CHUNK(postfix1);
    buf = addBranch(buf, buf2, 0x01);
    ADD_CHUNK(postfix2);

    // Drop write permission and make the page executable.
    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
    if (ret == -1) {
        ALOGE("mprotect error %i", ret);
        return false;
    }

    // Make the freshly written instructions visible to instruction fetch.
    __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
    return true;
#else
    (void) key; // Avoid unused parameter warning.
    return false;
#endif
}
783 
784 void ColorMatrixTask::updateCoeffCache(float fpMul, float addMul) {
785     for(int ct=0; ct < 16; ct++) {
786         mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
787         mTmpFp[ct] = mFp[ct] * fpMul;
788         //ALOGE("mat %i %f  %f", ct, mFp[ct], tmpFp[ct]);
789     }
790 
791     float add = 0.f;
792     if (fpMul > 254.f) add = 0.5f;
793     for(int ct=0; ct < 4; ct++) {
794         mTmpFpa[ct] = mFpa[ct] * addMul + add;
795         //ALOGE("mFpa %i %f  %f", ct, mFpa[ct], tmpFpa[ct * 4 + 0]);
796     }
797 
798     for(int ct=0; ct < 4; ct++) {
799         mIpa[ct] = (int)(mFpa[ct] * 65536.f + 0.5f);
800     }
801 }
802 
803 
804 
805 static void One(void *out,
806                 const void *py, const float* coeff, const float *add,
807                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
808 
809     float4 f = 0.f;
810     if (fin) {
811         switch(vsin) {
812         case 3:
813             f = ((const float4 *)py)[0];
814             break;
815         case 2:
816             f = ((const float4 *)py)[0];
817             f.w = 0.f;
818             break;
819         case 1:
820             f.xy = ((const float2 *)py)[0];
821             break;
822         case 0:
823             f.x = ((const float *)py)[0];
824             break;
825         }
826     } else {
827         switch(vsin) {
828         case 3:
829             f = convert<float4>(((const uchar4 *)py)[0]);
830             break;
831         case 2:
832             f = convert<float4>(((const uchar4 *)py)[0]);
833             f.w = 0.f;
834             break;
835         case 1:
836             f.xy = convert<float2>(((const uchar2 *)py)[0]);
837             break;
838         case 0:
839             f.x = (float)(((const uchar *)py)[0]);
840             break;
841         }
842     }
843     //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
844 
845     float4 sum;
846     sum.x = f.x * coeff[0] +
847             f.y * coeff[4] +
848             f.z * coeff[8] +
849             f.w * coeff[12];
850     sum.y = f.x * coeff[1] +
851             f.y * coeff[5] +
852             f.z * coeff[9] +
853             f.w * coeff[13];
854     sum.z = f.x * coeff[2] +
855             f.y * coeff[6] +
856             f.z * coeff[10] +
857             f.w * coeff[14];
858     sum.w = f.x * coeff[3] +
859             f.y * coeff[7] +
860             f.z * coeff[11] +
861             f.w * coeff[15];
862     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
863 
864     sum.x += add[0];
865     sum.y += add[1];
866     sum.z += add[2];
867     sum.w += add[3];
868 
869 
870     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
871     if (fout) {
872         switch(vsout) {
873         case 3:
874         case 2:
875             ((float4 *)out)[0] = sum;
876             break;
877         case 1:
878             ((float2 *)out)[0] = sum.xy;
879             break;
880         case 0:
881             ((float *)out)[0] = sum.x;
882             break;
883         }
884     } else {
885         sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
886         sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
887         sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
888         sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
889 
890         switch(vsout) {
891         case 3:
892         case 2:
893             ((uchar4 *)out)[0] = convert<uchar4>(sum);
894             break;
895         case 1:
896             ((uchar2 *)out)[0] = convert<uchar2>(sum.xy);
897             break;
898         case 0:
899             ((uchar *)out)[0] = sum.x;
900             break;
901         }
902     }
903     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2],
904     //      ((float *)out)[3]);
905 }
906 
907 void ColorMatrixTask::kernel(uchar *out, uchar *in, uint32_t xstart, uint32_t xend) {
908     uint32_t x1 = xstart;
909     uint32_t x2 = xend;
910 
911     uint32_t vsin = mLastKey.u.inVecSize;
912     uint32_t vsout = mLastKey.u.outVecSize;
913     bool floatIn = !!mLastKey.u.inType;
914     bool floatOut = !!mLastKey.u.outType;
915 
916     //if (!info->current.y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
917 
918     if(x2 > x1) {
919         int32_t len = x2 - x1;
920         if (mUsesSimd) {
921             if((mOptKernel != nullptr) && (len >= 4)) {
922                 // The optimized kernel processes 4 pixels at once
923                 // and requires a minimum of 1 chunk of 4
924                 mOptKernel(out, in, mIp, len >> 2);
925                 // Update the len and pointers so the generic code can
926                 // finish any leftover pixels
927                 len &= ~3;
928                 x1 += len;
929                 out += mOutstep * len;
930                 in += mInstep * len;
931             }
932 #if defined(ARCH_ARM64_USE_INTRINSICS)
933             else {
934                 if (mLastKey.u.inType == RS_TYPE_FLOAT_32 ||
935                     mLastKey.u.outType == RS_TYPE_FLOAT_32) {
936                     // Currently this generates off by one errors.
937                     // rsdIntrinsicColorMatrix_float_K(out, in, len, &mFnTab, tmpFp, tmpFpa);
938                     // x1 += len;
939                     // out += outstep * len;
940                     // in += instep * len;
941                 } else {
942                     rsdIntrinsicColorMatrix_int_K(out, in, len, &mFnTab, mIp, mIpa);
943                     x1 += len;
944                     out += mOutstep * len;
945                     in += mInstep * len;
946                 }
947             }
948 #endif
949         }
950 
951         while(x1 != x2) {
952             One(out, in, mTmpFp, mTmpFpa, vsin, vsout, floatIn, floatOut);
953             out += mOutstep;
954             in += mInstep;
955             x1++;
956         }
957     }
958 }
959 
960 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
961 void ColorMatrixTask::preLaunch(size_t inVectorSize, int inType, size_t outVectorSize,
962                                 int outType) {
963     if (inType == outType) {
964         if (outType == RS_TYPE_UNSIGNED_8) {
965             updateCoeffCache(1.f, 255.f);
966         } else {
967             updateCoeffCache(1.f, 1.f);
968         }
969     } else {
970         if (outType == RS_TYPE_UNSIGNED_8) {
971             updateCoeffCache(255.f, 255.f);
972         } else {
973             updateCoeffCache(1.f / 255.f, 1.f);
974         }
975     }
976 
977     Key_t key = computeKey(inVectorSize, inType, outVectorSize, outType);
978 #else
979 void ColorMatrixTask::preLaunch(size_t inVectorSize, size_t outVectorSize) {
980     updateCoeffCache(1.f, 255.f);
981 
982     Key_t key = computeKey(inVectorSize, outVectorSize);
983 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
984 
985 #if defined(ARCH_X86_HAVE_SSSE3)
986     if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
987         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
988         // mOptKernel =
989         //     (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key);
990         mLastKey = key;
991     }
992 
993 #else //if !defined(ARCH_X86_HAVE_SSSE3)
994     if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
995         if (mBuf) munmap(mBuf, mBufSize);
996         mBuf = nullptr;
997         mOptKernel = nullptr;
998         if (build(key)) {
999             mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf;
1000         }
1001 #if defined(ARCH_ARM64_USE_INTRINSICS)
1002         else {
1003             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
1004             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
1005             uint32_t mm = 0;
1006             int i;
1007             for (i = 0; i < 4; i++)
1008             {
1009                 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
1010                 m = ((m * 0x249) >> 9) & 15;
1011                 m |= ((key.u.addMask >> i) & 1) << 4;
1012                 mm |= m << (i * 5);
1013             }
1014 
1015             if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
1016                 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
1017             } else {
1018                 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
1019             }
1020         }
1021 #endif
1022         mLastKey = key;
1023     }
1024 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
1025 }
1026 
1027 void ColorMatrixTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
1028                                   size_t endY) {
1029     for (size_t y = startY; y < endY; y++) {
1030         size_t offset = mSizeX * y + startX;
1031         uchar* in = ((uchar*)mIn) + offset * paddedSize(mInputVectorSize);
1032         uchar* out = ((uchar*)mOut) + offset * paddedSize(mVectorSize);
1033         kernel(out, in, startX, endX);
1034     }
1035 }
1036 
// Four-element all-zero vector, used by colorMatrix() when no add vector is supplied.
static constexpr float fourZeroes[4] = {};
1038 
1039 void RenderScriptToolkit::colorMatrix(const void* in, void* out, size_t inputVectorSize,
1040                                       size_t outputVectorSize, size_t sizeX, size_t sizeY,
1041                                       const float* matrix, const float* addVector,
1042                                       const Restriction* restriction) {
1043 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
1044     if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
1045         return;
1046     }
1047     if (inputVectorSize < 1 || inputVectorSize > 4) {
1048         ALOGE("The inputVectorSize should be between 1 and 4. %zu provided.", inputVectorSize);
1049         return;
1050     }
1051     if (outputVectorSize < 1 || outputVectorSize > 4) {
1052         ALOGE("The outputVectorSize should be between 1 and 4. %zu provided.", outputVectorSize);
1053         return;
1054     }
1055 #endif
1056 
1057     if (addVector == nullptr) {
1058         addVector = fourZeroes;
1059     }
1060     ColorMatrixTask task(in, out, inputVectorSize, outputVectorSize, sizeX, sizeY, matrix,
1061                          addVector, restriction);
1062     processor->doTask(&task);
1063 }
1064 
1065 }  // namespace renderscript
1066 }  // namespace android
1067