1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <sys/mman.h>
18 #include <unistd.h>
19 
20 #include "rsCpuIntrinsic.h"
21 #include "rsCpuIntrinsicInlines.h"
22 
#include <errno.h>
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
27 //#include <utils/StopWatch.h>
28 
29 
30 /*  uint kernel
31  *  Q0  D0:  Load slot for R
32  *      D1:  Load slot for G
33  *  Q1  D2:  Load slot for B
34  *      D3:  Load slot for A
35  *  Q2  D4:  Matrix
36  *      D5:  =
37  *  Q3  D6:  =
38  *      D7:  =
39  *  Q4  D8:  Add R
40  *      D9:
41  *  Q5  D10: Add G
42  *      D11:
43  *  Q6  D12: Add B
44  *      D13:
45  *  Q7  D14: Add A
46  *      D15:
47  *  Q8  D16:  I32: R Sum
48  *      D17:
49  *  Q9  D18:  I32: G Sum
50  *      D19:
51  *  Q10 D20:  I32: B Sum
52  *      D21:
53  *  Q11 D22:  I32: A Sum
54  *      D23:
55  *  Q12 D24:  U16: expanded R
56  *      D25:
57  *  Q13 D26:  U16: expanded G
58  *      D27:
59  *  Q14 D28:  U16: expanded B
60  *      D29:
61  *  Q15 D30:  U16: expanded A
62  *      D31:
63  *
64  */
65 
66 /*  float kernel
67  *  Q0  D0:  Load slot for R
68  *      D1:  =
69  *  Q1  D2:  Load slot for G
70  *      D3:  =
71  *  Q2  D4:  Load slot for B
72  *      D5:  =
73  *  Q3  D6:  Load slot for A
74  *      D7:  =
75  *  Q4  D8:  Matrix
76  *      D9:  =
77  *  Q5  D10: =
78  *      D11: =
79  *  Q6  D12: =
80  *      D13: =
81  *  Q7  D14: =
82  *      D15: =
83  *  Q8  D16: Add R
84  *      D17: =
85  *  Q9  D18: Add G
86  *      D19: =
87  *  Q10 D20: Add B
88  *      D21: =
89  *  Q11 D22: Add A
90  *      D23: =
91  *  Q12 D24: Sum R
92  *      D25: =
93  *  Q13 D26: Sum G
94  *      D27: =
95  *  Q14 D28: Sum B
96  *      D29: =
97  *  Q15 D30: Sum A
98  *      D31: =
99  *
100  */
101 
102 
103 
104 namespace android {
105 namespace renderscript {
106 
// Specialization key describing one color-matrix configuration.  computeKey()
// fills in the bit-fields; preLaunch() compares the combined 64-bit `key`
// value against the previous one to decide whether a kernel must be rebuilt.
typedef union {
    // All of the fields below viewed as one integer.
    uint64_t key;
    struct {
        // Input vector size: 3 for 4 components, 2 for 3, 1 for 2, else 0.
        uint32_t inVecSize          :2;  // [0 - 1]
        // Output vector size, encoded the same way as inVecSize.
        uint32_t outVecSize         :2;  // [2 - 3]
        // RS_TYPE_FLOAT_32 when the input element is float, otherwise 0.
        uint32_t inType             :4;  // [4 - 7]
        // RS_TYPE_FLOAT_32 when the output element is float, otherwise 0.
        uint32_t outType            :4;  // [8 - 11]
        // Set when the r,g,b matrix columns are equal and no add term is used.
        uint32_t dot                :1;  // [12]
        uint32_t _unused1           :1;  // [13]
        // Set when alpha passes through unchanged (never set for float keys).
        uint32_t copyAlpha          :1;  // [14]
        uint32_t _unused2           :1;  // [15]
        // Bit i is set when matrix coefficient i is non-zero and reachable
        // for the chosen input/output vector sizes.
        uint32_t coeffMask          :16; // [16-31]
        // Bit j is set when add term j must be applied.
        uint32_t addMask            :4;  // [32-35]
    } u;
} Key_t;
122 
123 //Re-enable when intrinsic is fixed
#if defined(ARCH_ARM64_USE_INTRINSICS)
// Table of entry points consumed by the AArch64 assembly kernels declared
// below.  It is populated by the rsdIntrinsicColorMatrixSetup_*_K routines;
// the C++ code in this file only stores it and passes its address along.
typedef struct {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
} FunctionTab_t;

// Integer (fixed-point) assembly kernel.  kernel() calls it with `count` set
// to the number of remaining pixels, `mult` pointing at the int16 matrix (ip)
// and `add` pointing at the integer add vector (ipa).
extern "C" void rsdIntrinsicColorMatrix_int_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             int16_t const *mult, int32_t const *add);

// Float assembly kernel.  Its only call site in kernel() is currently
// commented out because it produces off-by-one errors.
extern "C" void rsdIntrinsicColorMatrix_float_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             float const *mult, float const *add);

/* The setup functions fill in function tables to be used by above functions;
 * this code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions.  While it's not performance critical, it works out easier
 * to write the set-up code in assembly than to try to expose the same symbols
 * and write the code in C.
 */
// NOTE(review): `mask`, `dt` and `st` are presumably the coefficient/add mask
// and the destination/source type codes assembled in preLaunch() -- confirm
// against the assembly source.
extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);
#endif
157 
// CPU driver object for the color-matrix intrinsic.  It keeps the 4x4 matrix
// and the add vector in several pre-scaled forms and, per launch, picks a
// kernel specialised for the current Key_t (falling back to scalar code).
class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
public:
    void populateScript(Script *) override;

    // Slot 0 replaces the matrix (fp), slot 1 replaces the add vector (fpa).
    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;

    ~RsdCpuScriptIntrinsicColorMatrix() override;
    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);

    // Refreshes the coefficient caches and the kernel selection for the
    // element types of the allocations about to be processed.
    void preLaunch(uint32_t slot, const Allocation ** ains,
                   uint32_t inLen, Allocation * aout, const void * usr,
                   uint32_t usrLen, const RsScriptCall *sc) override;

protected:
    // Matrix and add vector exactly as supplied through setGlobalVar().
    float fp[16];
    float fpa[4];

    // The following four fields are read as constants
    // by the SIMD assembly code.
    // ip:     matrix scaled by 256 and converted to int16.
    // ipa:    add vector scaled by 65536 and converted to int.
    // tmpFp:  matrix scaled by the fpMul passed to updateCoeffCache().
    // tmpFpa: add vector scaled by addMul (plus a rounding bias, see
    //         updateCoeffCache()).
    int16_t ip[16];
    int ipa[4];
    float tmpFp[16];
    float tmpFpa[4];
#if defined(ARCH_ARM64_USE_INTRINSICS)
    FunctionTab_t mFnTab;
#endif

    // Per-row worker installed as the script's root function.
    static void kernel(const RsExpandKernelDriverInfo *info,
                       uint32_t xstart, uint32_t xend,
                       uint32_t outstep);
    // Recomputes ip, ipa, tmpFp and tmpFpa from fp and fpa.
    void updateCoeffCache(float fpMul, float addMul);

    // Key of the configuration the current kernel selection was made for.
    Key_t mLastKey;
    // Buffer obtained from mmap() that holds generated code, and its size.
    unsigned char *mBuf;
    size_t mBufSize;

    Key_t computeKey(const Element *ein, const Element *eout);

    // Generates code for `key` into mBuf; returns false when no code was
    // generated (including on platforms without the code generator).
    bool build(Key_t key);

    // Optimized kernel, or nullptr.  kernel() passes `count` in units of
    // four pixels and always passes the int16 matrix as `coef`.
    void (*mOptKernel)(void *dst, const void *src, const int16_t *coef, uint32_t count);

};
201 
202 
computeKey(const Element * ein,const Element * eout)203 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
204         const Element *ein, const Element *eout) {
205 
206     Key_t key;
207     key.key = 0;
208 
209     // Compute a unique code key for this operation
210 
211     // Add to the key the input and output types
212     bool hasFloat = false;
213     if (ein->getType() == RS_TYPE_FLOAT_32) {
214         hasFloat = true;
215         key.u.inType = RS_TYPE_FLOAT_32;
216         rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
217     }
218     if (eout->getType() == RS_TYPE_FLOAT_32) {
219         hasFloat = true;
220         key.u.outType = RS_TYPE_FLOAT_32;
221         rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
222     }
223 
224     // Mask in the bits indicating which coefficients in the
225     // color matrix are needed.
226     if (hasFloat) {
227         for (uint32_t i=0; i < 16; i++) {
228             if (fabs(fp[i]) != 0.f) {
229                 key.u.coeffMask |= 1 << i;
230             }
231         }
232         if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
233         if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
234         if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
235         if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
236 
237     } else {
238         for (uint32_t i=0; i < 16; i++) {
239             if (ip[i] != 0) {
240                 key.u.coeffMask |= 1 << i;
241             }
242         }
243         if (ipa[0] != 0) key.u.addMask |= 0x1;
244         if (ipa[1] != 0) key.u.addMask |= 0x2;
245         if (ipa[2] != 0) key.u.addMask |= 0x4;
246         if (ipa[3] != 0) key.u.addMask |= 0x8;
247     }
248 
249     // Look for a dot product where the r,g,b colums are the same
250     if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
251         (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
252         (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
253         (ip[12] == ip[13]) && (ip[12] == ip[14])) {
254 
255         if (!key.u.addMask) key.u.dot = 1;
256     }
257 
258     // Is alpha a simple copy
259     if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
260         key.u.copyAlpha = !(key.u.inType || key.u.outType);
261     }
262 
263     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
264 
265     switch (ein->getVectorSize()) {
266     case 4:
267         key.u.inVecSize = 3;
268         break;
269     case 3:
270         key.u.inVecSize = 2;
271         key.u.coeffMask &= ~0xF000;
272         break;
273     case 2:
274         key.u.inVecSize = 1;
275         key.u.coeffMask &= ~0xFF00;
276         break;
277     default:
278         key.u.coeffMask &= ~0xFFF0;
279         break;
280     }
281 
282     switch (eout->getVectorSize()) {
283     case 4:
284         key.u.outVecSize = 3;
285         break;
286     case 3:
287         key.u.outVecSize = 2;
288         key.u.coeffMask &= ~0x8888;
289         key.u.addMask &= 7;
290         break;
291     case 2:
292         key.u.outVecSize = 1;
293         key.u.coeffMask &= ~0xCCCC;
294         key.u.addMask &= 3;
295         break;
296     default:
297         key.u.coeffMask &= ~0xEEEE;
298         key.u.addMask &= 1;
299         break;
300     }
301 
302     if (key.u.inType && !key.u.outType) {
303         key.u.addMask |= 1;
304         if (key.u.outVecSize > 0) key.u.addMask |= 2;
305         if (key.u.outVecSize > 1) key.u.addMask |= 4;
306         if (key.u.outVecSize > 2) key.u.addMask |= 8;
307     }
308 
309     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
310     return key;
311 }
312 
313 } // namespace renderscript
314 } // namespace android
315 
316 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
317 
// Declares the three external symbols that describe one pre-assembled code
// fragment named <x>: the fragment itself, a symbol with an `_end` suffix,
// and a word holding the fragment's length in bytes (ADD_CHUNK copies that
// many bytes starting at the first symbol).
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologues (integer / float variants) and the two epilogue pieces
// emitted around the loop branch in build().
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Input loaders, one per element type and vector width.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Output writers.  There is no 3-wide u8 / f32u variant; build() uses the
// 4-wide fragment for 3-component output.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer-path helper fragments used between load and store.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
365 
// Appends the pre-assembled fragment <x> at the local write cursor `buf` and
// advances `buf` past it.  The body is wrapped in do { } while (0) so that the
// copy and the cursor update always behave as a single statement, e.g. when
// the macro is the body of an unbraced if/else.  Use as `ADD_CHUNK(name);`.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
369 
370 
371 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
372     size_t off = (target - buf - 8) >> 2;
373     rsAssert(((off & 0xff000000) == 0) ||
374            ((off & 0xff000000) == 0xff000000));
375 
376     uint32_t op = (condition << 28);
377     op |= 0xa << 24;  // branch
378     op |= 0xffffff & off;
379     ((uint32_t *)buf)[0] = op;
380     return buf + 4;
381 }
382 
encodeSIMDRegs(uint32_t vd,uint32_t vn,uint32_t vm)383 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
384     rsAssert(vd < 32);
385     rsAssert(vm < 32);
386     rsAssert(vn < 32);
387 
388     uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
389     op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
390     op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
391     return op;
392 }
393 
addVMLAL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)394 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
395     //vmlal.s16 Q#1, D#1, D#2[#]
396     uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
397     ((uint32_t *)buf)[0] = op;
398     return buf + 4;
399 }
400 
addVMULL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)401 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
402     //vmull.s16 Q#1, D#1, D#2[#]
403     uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
404     ((uint32_t *)buf)[0] = op;
405     return buf + 4;
406 }
407 
addVQADD_S32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)408 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
409     //vqadd.s32 Q#1, Q#1, Q#2
410     uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
411     ((uint32_t *)buf)[0] = op;
412     return buf + 4;
413 }
414 
addVMLAL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)415 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
416     //vmlal.f32 Q#1, D#1, D#2[#]
417     uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
418     ((uint32_t *)buf)[0] = op;
419     return buf + 4;
420 }
421 
addVMULL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)422 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
423     //vmull.f32 Q#1, D#1, D#2[#]
424     uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
425     ((uint32_t *)buf)[0] = op;
426     return buf + 4;
427 }
428 
addVORR_32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)429 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
430     //vadd.f32 Q#1, D#1, D#2
431     uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
432     ((uint32_t *)buf)[0] = op;
433     return buf + 4;
434 }
435 
addVMOV_32(uint8_t * buf,uint32_t dest_q,uint32_t imm)436 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
437     //vmov.32 Q#1, #imm
438     rsAssert(imm == 0);
439     uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
440     ((uint32_t *)buf)[0] = op;
441     return buf + 4;
442 }
443 
addVADD_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)444 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
445     //vadd.f32 Q#1, D#1, D#2
446     uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
447     ((uint32_t *)buf)[0] = op;
448     return buf + 4;
449 }
450 #endif
451 
452 #if defined(ARCH_X86_HAVE_SSSE3)
453 extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
454                                   const int16_t *coef, uint32_t count);
455 extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
456                                   const int16_t *coef, uint32_t count);
457 extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
458                                   const int16_t *coef, uint32_t count);
459 
460 using android::renderscript::Key_t;
461 
selectKernel(Key_t key)462 void * selectKernel(Key_t key)
463 {
464     void * kernel = nullptr;
465 
466     // inType, outType float if nonzero
467     if (!(key.u.inType || key.u.outType)) {
468         if (key.u.dot)
469             kernel = (void *)rsdIntrinsicColorMatrixDot_K;
470         else if (key.u.copyAlpha)
471             kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
472         else
473             kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
474     }
475 
476     return kernel;
477 }
478 #endif
479 
480 namespace android {
481 namespace renderscript {
482 
build(Key_t key)483 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
484 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
485     mBufSize = 4096;
486     //StopWatch build_time("rs cm: build time");
487     mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
488                                   MAP_PRIVATE | MAP_ANON, -1, 0);
489     if (mBuf == MAP_FAILED) {
490         mBuf = NULL;
491         return false;
492     }
493 
494     uint8_t *buf = mBuf;
495     uint8_t *buf2 = nullptr;
496 
497     int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
498     int opInit[4] = {0, 0, 0, 0};
499 
500     memset(ops, 0, sizeof(ops));
501     for (int i=0; i < 4; i++) {
502         if (key.u.coeffMask & (1 << (i*4))) {
503             ops[i][0] = 0x2 | opInit[0];
504             opInit[0] = 1;
505         }
506         if (!key.u.dot) {
507             if (key.u.coeffMask & (1 << (1 + i*4))) {
508                 ops[i][1] = 0x2 | opInit[1];
509                 opInit[1] = 1;
510             }
511             if (key.u.coeffMask & (1 << (2 + i*4))) {
512                 ops[i][2] = 0x2 | opInit[2];
513                 opInit[2] = 1;
514             }
515         }
516         if (!key.u.copyAlpha) {
517             if (key.u.coeffMask & (1 << (3 + i*4))) {
518                 ops[i][3] = 0x2 | opInit[3];
519                 opInit[3] = 1;
520             }
521         }
522     }
523 
524     if (key.u.inType || key.u.outType) {
525         key.u.copyAlpha = 0;
526         ADD_CHUNK(prefix_f);
527         buf2 = buf;
528 
529         // Load the incoming r,g,b,a as needed
530         if (key.u.inType) {
531             switch(key.u.inVecSize) {
532             case 3:
533                 ADD_CHUNK(load_f32_4);
534                 break;
535             case 2:
536                 ADD_CHUNK(load_f32_3);
537                 break;
538             case 1:
539                 ADD_CHUNK(load_f32_2);
540                 break;
541             case 0:
542                 ADD_CHUNK(load_f32_1);
543                 break;
544             }
545         } else {
546             switch(key.u.inVecSize) {
547             case 3:
548                 ADD_CHUNK(load_u8f_4);
549                 break;
550             case 2:
551                 ADD_CHUNK(load_u8f_3);
552                 break;
553             case 1:
554                 ADD_CHUNK(load_u8f_2);
555                 break;
556             case 0:
557                 ADD_CHUNK(load_u8f_1);
558                 break;
559             }
560         }
561 
562         for (int i=0; i < 4; i++) {
563             for (int j=0; j < 4; j++) {
564                 switch(ops[i][j]) {
565                 case 0:
566                     break;
567                 case 2:
568                     buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
569                     break;
570                 case 3:
571                     buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
572                     break;
573                 }
574             }
575         }
576         for (int j=0; j < 4; j++) {
577             if (opInit[j]) {
578                 if (key.u.addMask & (1 << j)) {
579                     buf = addVADD_F32(buf, j, 12+j, 8+j);
580                 } else {
581                     buf = addVORR_32(buf, j, 12+j, 12+j);
582                 }
583             } else {
584                 if (key.u.addMask & (1 << j)) {
585                     buf = addVORR_32(buf, j, 8+j, 8+j);
586                 } else {
587                     buf = addVMOV_32(buf, j, 0);
588                 }
589             }
590         }
591 
592         if (key.u.outType) {
593             switch(key.u.outVecSize) {
594             case 3:
595                 ADD_CHUNK(store_f32_4);
596                 break;
597             case 2:
598                 ADD_CHUNK(store_f32_3);
599                 break;
600             case 1:
601                 ADD_CHUNK(store_f32_2);
602                 break;
603             case 0:
604                 ADD_CHUNK(store_f32_1);
605                 break;
606             }
607         } else {
608             switch(key.u.outVecSize) {
609             case 3:
610             case 2:
611                 ADD_CHUNK(store_f32u_4);
612                 break;
613             case 1:
614                 ADD_CHUNK(store_f32u_2);
615                 break;
616             case 0:
617                 ADD_CHUNK(store_f32u_1);
618                 break;
619             }
620         }
621 
622 
623     } else {
624         // Add the function prefix
625         // Store the address for the loop return
626         ADD_CHUNK(prefix_i);
627         buf2 = buf;
628 
629         // Load the incoming r,g,b,a as needed
630         switch(key.u.inVecSize) {
631         case 3:
632             ADD_CHUNK(load_u8_4);
633             if (key.u.copyAlpha) {
634                 ADD_CHUNK(unpack_u8_3);
635             } else {
636                 ADD_CHUNK(unpack_u8_4);
637             }
638             break;
639         case 2:
640             ADD_CHUNK(load_u8_3);
641             ADD_CHUNK(unpack_u8_3);
642             break;
643         case 1:
644             ADD_CHUNK(load_u8_2);
645             ADD_CHUNK(unpack_u8_2);
646             break;
647         case 0:
648             ADD_CHUNK(load_u8_1);
649             ADD_CHUNK(unpack_u8_1);
650             break;
651         }
652 
653         // Add multiply and accumulate
654         // use MULL to init the output register,
655         // use MLAL from there
656         for (int i=0; i < 4; i++) {
657             for (int j=0; j < 4; j++) {
658                 switch(ops[i][j]) {
659                 case 0:
660                     break;
661                 case 2:
662                     buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
663                     break;
664                 case 3:
665                     buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
666                     break;
667                 }
668             }
669         }
670         for (int j=0; j < 4; j++) {
671             if (opInit[j]) {
672                 if (key.u.addMask & (1 << j)) {
673                     buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
674                 }
675             } else {
676                 if (key.u.addMask & (1 << j)) {
677                     buf = addVORR_32(buf, 8+j, 4+j, 4+j);
678                 }
679             }
680         }
681 
682         // If we have a dot product, perform the special pack.
683         if (key.u.dot) {
684             ADD_CHUNK(pack_u8_1);
685             ADD_CHUNK(dot);
686         } else {
687             switch(key.u.outVecSize) {
688             case 3:
689                 if (key.u.copyAlpha) {
690                     ADD_CHUNK(pack_u8_3);
691                 } else {
692                     ADD_CHUNK(pack_u8_4);
693                 }
694                 break;
695             case 2:
696                 ADD_CHUNK(pack_u8_3);
697                 break;
698             case 1:
699                 ADD_CHUNK(pack_u8_2);
700                 break;
701             case 0:
702                 ADD_CHUNK(pack_u8_1);
703                 break;
704             }
705         }
706 
707         // Write out result
708         switch(key.u.outVecSize) {
709         case 3:
710         case 2:
711             ADD_CHUNK(store_u8_4);
712             break;
713         case 1:
714             ADD_CHUNK(store_u8_2);
715             break;
716         case 0:
717             ADD_CHUNK(store_u8_1);
718             break;
719         }
720     }
721 
722     if (key.u.inType != key.u.outType) {
723         key.u.copyAlpha = 0;
724         key.u.dot = 0;
725     }
726 
727     // Loop, branch, and cleanup
728     ADD_CHUNK(postfix1);
729     buf = addBranch(buf, buf2, 0x01);
730     ADD_CHUNK(postfix2);
731 
732     int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
733     if (ret == -1) {
734         ALOGE("mprotect error %i", ret);
735         return false;
736     }
737 
738     __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
739     return true;
740 #else
741     return false;
742 #endif
743 }
744 
updateCoeffCache(float fpMul,float addMul)745 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
746     for(int ct=0; ct < 16; ct++) {
747         ip[ct] = (int16_t)(fp[ct] * 256.f + 0.5f);
748         tmpFp[ct] = fp[ct] * fpMul;
749         //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
750     }
751 
752     float add = 0.f;
753     if (fpMul > 254.f) add = 0.5f;
754     for(int ct=0; ct < 4; ct++) {
755         tmpFpa[ct] = fpa[ct] * addMul + add;
756         //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
757     }
758 
759     for(int ct=0; ct < 4; ct++) {
760         ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
761     }
762 }
763 
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)764 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
765                                                     size_t dataLength) {
766     switch(slot) {
767     case 0:
768         memcpy (fp, data, sizeof(fp));
769         break;
770     case 1:
771         memcpy (fpa, data, sizeof(fpa));
772         break;
773     default:
774         rsAssert(0);
775         break;
776     }
777     mRootPtr = &kernel;
778 }
779 
780 
One(const RsExpandKernelDriverInfo * info,void * out,const void * py,const float * coeff,const float * add,uint32_t vsin,uint32_t vsout,bool fin,bool fout)781 static void One(const RsExpandKernelDriverInfo *info, void *out,
782                 const void *py, const float* coeff, const float *add,
783                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
784 
785     float4 f = 0.f;
786     if (fin) {
787         switch(vsin) {
788         case 3:
789             f = ((const float4 *)py)[0];
790             break;
791         case 2:
792             f = ((const float4 *)py)[0];
793             f.w = 0.f;
794             break;
795         case 1:
796             f.xy = ((const float2 *)py)[0];
797             break;
798         case 0:
799             f.x = ((const float *)py)[0];
800             break;
801         }
802     } else {
803         switch(vsin) {
804         case 3:
805             f = convert_float4(((const uchar4 *)py)[0]);
806             break;
807         case 2:
808             f = convert_float4(((const uchar4 *)py)[0]);
809             f.w = 0.f;
810             break;
811         case 1:
812             f.xy = convert_float2(((const uchar2 *)py)[0]);
813             break;
814         case 0:
815             f.x = (float)(((const uchar *)py)[0]);
816             break;
817         }
818     }
819     //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
820 
821     float4 sum;
822     sum.x = f.x * coeff[0] +
823             f.y * coeff[4] +
824             f.z * coeff[8] +
825             f.w * coeff[12];
826     sum.y = f.x * coeff[1] +
827             f.y * coeff[5] +
828             f.z * coeff[9] +
829             f.w * coeff[13];
830     sum.z = f.x * coeff[2] +
831             f.y * coeff[6] +
832             f.z * coeff[10] +
833             f.w * coeff[14];
834     sum.w = f.x * coeff[3] +
835             f.y * coeff[7] +
836             f.z * coeff[11] +
837             f.w * coeff[15];
838     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
839 
840     sum.x += add[0];
841     sum.y += add[1];
842     sum.z += add[2];
843     sum.w += add[3];
844 
845 
846     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
847     if (fout) {
848         switch(vsout) {
849         case 3:
850         case 2:
851             ((float4 *)out)[0] = sum;
852             break;
853         case 1:
854             ((float2 *)out)[0] = sum.xy;
855             break;
856         case 0:
857             ((float *)out)[0] = sum.x;
858             break;
859         }
860     } else {
861         sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
862         sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
863         sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
864         sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
865 
866         switch(vsout) {
867         case 3:
868         case 2:
869             ((uchar4 *)out)[0] = convert_uchar4(sum);
870             break;
871         case 1:
872             ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
873             break;
874         case 0:
875             ((uchar *)out)[0] = sum.x;
876             break;
877         }
878     }
879     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
880 }
881 
kernel(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)882 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info,
883                                               uint32_t xstart, uint32_t xend,
884                                               uint32_t outstep) {
885     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr;
886 
887     uint32_t instep = info->inStride[0];
888 
889     uchar *out = (uchar *)info->outPtr[0];
890     uchar *in = (uchar *)info->inPtr[0];
891     uint32_t x1 = xstart;
892     uint32_t x2 = xend;
893 
894     uint32_t vsin = cp->mLastKey.u.inVecSize;
895     uint32_t vsout = cp->mLastKey.u.outVecSize;
896     bool floatIn = !!cp->mLastKey.u.inType;
897     bool floatOut = !!cp->mLastKey.u.outType;
898 
899     //if (!info->current.y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
900 
901     if(x2 > x1) {
902         int32_t len = x2 - x1;
903         if (gArchUseSIMD) {
904             if((cp->mOptKernel != nullptr) && (len >= 4)) {
905                 // The optimized kernel processes 4 pixels at once
906                 // and requires a minimum of 1 chunk of 4
907                 cp->mOptKernel(out, in, cp->ip, len >> 2);
908                 // Update the len and pointers so the generic code can
909                 // finish any leftover pixels
910                 len &= ~3;
911                 x1 += len;
912                 out += outstep * len;
913                 in += instep * len;
914             }
915 #if defined(ARCH_ARM64_USE_INTRINSICS)
916             else {
917                 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
918                     // Currently this generates off by one errors.
919                     //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
920                     //x1 += len;
921                     //out += outstep * len;
922                     //in += instep * len;
923                 } else {
924                     rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
925                     x1 += len;
926                     out += outstep * len;
927                     in += instep * len;
928                 }
929             }
930 #endif
931         }
932 
933         while(x1 != x2) {
934             One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
935             out += outstep;
936             in += instep;
937             x1++;
938         }
939     }
940 }
941 
preLaunch(uint32_t slot,const Allocation ** ains,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)942 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
943                                                  const Allocation ** ains,
944                                                  uint32_t inLen,
945                                                  Allocation * aout,
946                                                  const void * usr,
947                                                  uint32_t usrLen,
948                                                  const RsScriptCall *sc) {
949 
950     const Element *ein = ains[0]->mHal.state.type->getElement();
951     const Element *eout = aout->mHal.state.type->getElement();
952 
953     if (ein->getType() == eout->getType()) {
954         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
955             updateCoeffCache(1.f, 255.f);
956         } else {
957             updateCoeffCache(1.f, 1.f);
958         }
959     } else {
960         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
961             updateCoeffCache(255.f, 255.f);
962         } else {
963             updateCoeffCache(1.f / 255.f, 1.f);
964         }
965     }
966 
967     Key_t key = computeKey(ein, eout);
968 
969 #if defined(ARCH_X86_HAVE_SSSE3)
970     if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
971         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
972         // mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key);
973         mLastKey = key;
974     }
975 
976 #else //if !defined(ARCH_X86_HAVE_SSSE3)
977     if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
978         if (mBuf) munmap(mBuf, mBufSize);
979         mBuf = nullptr;
980         mOptKernel = nullptr;
981         if (build(key)) {
982             mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf;
983         }
984 #if defined(ARCH_ARM64_USE_INTRINSICS)
985         else {
986             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
987             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
988             uint32_t mm = 0;
989             int i;
990             for (i = 0; i < 4; i++)
991             {
992                 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
993                 m = ((m * 0x249) >> 9) & 15;
994                 m |= ((key.u.addMask >> i) & 1) << 4;
995                 mm |= m << (i * 5);
996             }
997 
998             if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
999                 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
1000             } else {
1001                 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
1002             }
1003         }
1004 #endif
1005         mLastKey = key;
1006     }
1007 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
1008 }
1009 
RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)1010 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
1011             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1012             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
1013 
1014     mLastKey.key = 0;
1015     mBuf = nullptr;
1016     mBufSize = 0;
1017     mOptKernel = nullptr;
1018     const static float defaultMatrix[] = {
1019         1.f, 0.f, 0.f, 0.f,
1020         0.f, 1.f, 0.f, 0.f,
1021         0.f, 0.f, 1.f, 0.f,
1022         0.f, 0.f, 0.f, 1.f
1023     };
1024     const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
1025     setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
1026     setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
1027 }
1028 
~RsdCpuScriptIntrinsicColorMatrix()1029 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
1030     if (mBuf) munmap(mBuf, mBufSize);
1031     mBuf = nullptr;
1032     mOptKernel = nullptr;
1033 }
1034 
populateScript(Script * s)1035 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
1036     s->mHal.info.exportedVariableCount = 2;
1037 }
1038 
rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)1039 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1040                                             const Script *s, const Element *e) {
1041 
1042     return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
1043 }
1044 
1045 } // namespace renderscript
1046 } // namespace android
1047