1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <sys/mman.h>
18 #include <unistd.h>
19
20 #include "rsCpuIntrinsic.h"
21 #include "rsCpuIntrinsicInlines.h"
22
23 #include <sys/mman.h>
24 #include <stddef.h>
25 #include <stdint.h>
26 #include <stdlib.h>
27 //#include <utils/StopWatch.h>
28
29
/*  uint kernel
 *  Q0  D0:  Load slot for R
 *      D1:  Load slot for G
 *  Q1  D2:  Load slot for B
 *      D3:  Load slot for A
 *  Q2  D4:  Matrix
 *      D5:  =
 *  Q3  D6:  =
 *      D7:  =
 *  Q4  D8:  Add R
 *      D9:
 *  Q5  D10: Add G
 *      D11:
 *  Q6  D12: Add B
 *      D13:
 *  Q7  D14: Add A
 *      D15:
 *  Q8  D16: I32: R Sum
 *      D17:
 *  Q9  D18: I32: G Sum
 *      D19:
 *  Q10 D20: I32: B Sum
 *      D21:
 *  Q11 D22: I32: A Sum
 *      D23:
 *  Q12 D24: U16: expanded R
 *      D25:
 *  Q13 D26: U16: expanded G
 *      D27:
 *  Q14 D28: U16: expanded B
 *      D29:
 *  Q15 D30: U16: expanded A
 *      D31:
 *
 */
65
/*  float kernel
 *  Q0  D0:  Load slot for R
 *      D1:  =
 *  Q1  D2:  Load slot for G
 *      D3:  =
 *  Q2  D4:  Load slot for B
 *      D5:  =
 *  Q3  D6:  Load slot for A
 *      D7:  =
 *  Q4  D8:  Matrix
 *      D9:  =
 *  Q5  D10: =
 *      D11: =
 *  Q6  D12: =
 *      D13: =
 *  Q7  D14: =
 *      D15: =
 *  Q8  D16: Add R
 *      D17: =
 *  Q9  D18: Add G
 *      D19: =
 *  Q10 D20: Add B
 *      D21: =
 *  Q11 D22: Add A
 *      D23: =
 *  Q12 D24: Sum R
 *      D25: =
 *  Q13 D26: Sum G
 *      D27: =
 *  Q14 D28: Sum B
 *      D29: =
 *  Q15 D30: Sum A
 *      D31: =
 *
 */
101
102
103
104 namespace android {
105 namespace renderscript {
106
// Compact description of one color-matrix configuration.
//
// The bit-field view `u` and the 64-bit integer `key` share storage, so two
// configurations are compared by comparing `key`. Callers zero `key` before
// filling in the individual fields.
union Key_t {
    uint64_t key;
    struct {
        uint32_t inVecSize  :2;   // [0 - 1]   input vector size minus one
        uint32_t outVecSize :2;   // [2 - 3]   output vector size minus one
        uint32_t inType     :4;   // [4 - 7]   non-zero when the input is float
        uint32_t outType    :4;   // [8 - 11]  non-zero when the output is float
        uint32_t dot        :1;   // [12]      r, g and b columns are identical
        uint32_t _unused1   :1;   // [13]
        uint32_t copyAlpha  :1;   // [14]      alpha passes through unchanged
        uint32_t _unused2   :1;   // [15]
        uint32_t coeffMask  :16;  // [16-31]   bit i set: matrix entry i is used
        uint32_t addMask    :4;   // [32-35]   bit j set: add term j is used
    } u;
};
122
123 //Re-enable when intrinsic is fixed
124 #if defined(ARCH_ARM64_USE_INTRINSICS)
// Table of code entry points handed to the ARM64 assembly kernels. It is
// filled in by the rsdIntrinsicColorMatrixSetup_*_K routines and passed, by
// pointer, to the rsdIntrinsicColorMatrix_*_K kernels. The member order is
// part of the contract with the assembly and must not change.
struct FunctionTab_t {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
};

extern "C" {

// Integer kernel: transforms `count` elements from `in` to `out` using the
// int16 matrix `mult` and the int32 add vector `add`.
void rsdIntrinsicColorMatrix_int_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             int16_t const *mult, int32_t const *add);

// Float kernel: same shape as the integer kernel with float coefficients.
void rsdIntrinsicColorMatrix_float_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             float const *mult, float const *add);

/* The setup functions fill in function tables to be used by above functions;
 * this code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions.  While it's not performance critical, it works out easier
 * to write the set-up code in assembly than to try to expose the same symbols
 * and write the code in C.
 */
void rsdIntrinsicColorMatrixSetup_int_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

void rsdIntrinsicColorMatrixSetup_float_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

}  // extern "C"
156 #endif
157
158 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
159 public:
160 void populateScript(Script *) override;
161
162 void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
163
164 ~RsdCpuScriptIntrinsicColorMatrix() override;
165 RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
166
167 void preLaunch(uint32_t slot, const Allocation ** ains,
168 uint32_t inLen, Allocation * aout, const void * usr,
169 uint32_t usrLen, const RsScriptCall *sc) override;
170
171 protected:
172 float fp[16];
173 float fpa[4];
174
175 // The following four fields are read as constants
176 // by the SIMD assembly code.
177 int16_t ip[16];
178 int ipa[4];
179 float tmpFp[16];
180 float tmpFpa[4];
181 #if defined(ARCH_ARM64_USE_INTRINSICS)
182 FunctionTab_t mFnTab;
183 #endif
184
185 static void kernel(const RsExpandKernelDriverInfo *info,
186 uint32_t xstart, uint32_t xend,
187 uint32_t outstep);
188 void updateCoeffCache(float fpMul, float addMul);
189
190 Key_t mLastKey;
191 unsigned char *mBuf;
192 size_t mBufSize;
193
194 Key_t computeKey(const Element *ein, const Element *eout);
195
196 bool build(Key_t key);
197
198 void (*mOptKernel)(void *dst, const void *src, const int16_t *coef, uint32_t count);
199
200 };
201
202
computeKey(const Element * ein,const Element * eout)203 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
204 const Element *ein, const Element *eout) {
205
206 Key_t key;
207 key.key = 0;
208
209 // Compute a unique code key for this operation
210
211 // Add to the key the input and output types
212 bool hasFloat = false;
213 if (ein->getType() == RS_TYPE_FLOAT_32) {
214 hasFloat = true;
215 key.u.inType = RS_TYPE_FLOAT_32;
216 rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
217 }
218 if (eout->getType() == RS_TYPE_FLOAT_32) {
219 hasFloat = true;
220 key.u.outType = RS_TYPE_FLOAT_32;
221 rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
222 }
223
224 // Mask in the bits indicating which coefficients in the
225 // color matrix are needed.
226 if (hasFloat) {
227 for (uint32_t i=0; i < 16; i++) {
228 if (fabs(fp[i]) != 0.f) {
229 key.u.coeffMask |= 1 << i;
230 }
231 }
232 if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
233 if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
234 if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
235 if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
236
237 } else {
238 for (uint32_t i=0; i < 16; i++) {
239 if (ip[i] != 0) {
240 key.u.coeffMask |= 1 << i;
241 }
242 }
243 if (ipa[0] != 0) key.u.addMask |= 0x1;
244 if (ipa[1] != 0) key.u.addMask |= 0x2;
245 if (ipa[2] != 0) key.u.addMask |= 0x4;
246 if (ipa[3] != 0) key.u.addMask |= 0x8;
247 }
248
249 // Look for a dot product where the r,g,b colums are the same
250 if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
251 (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
252 (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
253 (ip[12] == ip[13]) && (ip[12] == ip[14])) {
254
255 if (!key.u.addMask) key.u.dot = 1;
256 }
257
258 // Is alpha a simple copy
259 if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
260 key.u.copyAlpha = !(key.u.inType || key.u.outType);
261 }
262
263 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
264
265 switch (ein->getVectorSize()) {
266 case 4:
267 key.u.inVecSize = 3;
268 break;
269 case 3:
270 key.u.inVecSize = 2;
271 key.u.coeffMask &= ~0xF000;
272 break;
273 case 2:
274 key.u.inVecSize = 1;
275 key.u.coeffMask &= ~0xFF00;
276 break;
277 default:
278 key.u.coeffMask &= ~0xFFF0;
279 break;
280 }
281
282 switch (eout->getVectorSize()) {
283 case 4:
284 key.u.outVecSize = 3;
285 break;
286 case 3:
287 key.u.outVecSize = 2;
288 key.u.coeffMask &= ~0x8888;
289 key.u.addMask &= 7;
290 break;
291 case 2:
292 key.u.outVecSize = 1;
293 key.u.coeffMask &= ~0xCCCC;
294 key.u.addMask &= 3;
295 break;
296 default:
297 key.u.coeffMask &= ~0xEEEE;
298 key.u.addMask &= 1;
299 break;
300 }
301
302 if (key.u.inType && !key.u.outType) {
303 key.u.addMask |= 1;
304 if (key.u.outVecSize > 0) key.u.addMask |= 2;
305 if (key.u.outVecSize > 1) key.u.addMask |= 4;
306 if (key.u.outVecSize > 2) key.u.addMask |= 8;
307 }
308
309 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
310 return key;
311 }
312
313 } // namespace renderscript
314 } // namespace android
315
316 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
317
// Declares the three symbols that describe one pre-assembled code fragment:
// its first word, an end marker, and a word holding its length in bytes
// (the length is what ADD_CHUNK copies).
#define DEF_SYM(x)                                        \
    extern "C" uint32_t _N_ColorMatrix_##x;               \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;         \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologue (integer / float flavour) and the two-part epilogue.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads: u8 for the integer kernel, u8f and f32 for the float kernel.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores. There is no 3-wide u8 / f32u variant; the 4-wide one is used.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer-path unpack / pack helpers and miscellaneous fragments.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
365
// Appends pre-assembled fragment `x` to the code being generated: copies
// _N_ColorMatrix_<x>_len bytes starting at _N_ColorMatrix_<x> to the write
// cursor and advances the cursor past them.
//
// The macro expects a byte pointer named `buf` in the calling scope. It is
// wrapped in do { } while (0) so that both statements stay together when the
// macro is the only statement of an unbraced if/else; invoke it with a
// trailing semicolon.
#define ADD_CHUNK(x)                                                    \
    do {                                                                \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len);     \
        buf += _N_ColorMatrix_##x##_len;                                \
    } while (0)
369
370
371 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
372 size_t off = (target - buf - 8) >> 2;
373 rsAssert(((off & 0xff000000) == 0) ||
374 ((off & 0xff000000) == 0xff000000));
375
376 uint32_t op = (condition << 28);
377 op |= 0xa << 24; // branch
378 op |= 0xffffff & off;
379 ((uint32_t *)buf)[0] = op;
380 return buf + 4;
381 }
382
encodeSIMDRegs(uint32_t vd,uint32_t vn,uint32_t vm)383 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
384 rsAssert(vd < 32);
385 rsAssert(vm < 32);
386 rsAssert(vn < 32);
387
388 uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
389 op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
390 op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
391 return op;
392 }
393
addVMLAL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)394 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
395 //vmlal.s16 Q#1, D#1, D#2[#]
396 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
397 ((uint32_t *)buf)[0] = op;
398 return buf + 4;
399 }
400
addVMULL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)401 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
402 //vmull.s16 Q#1, D#1, D#2[#]
403 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
404 ((uint32_t *)buf)[0] = op;
405 return buf + 4;
406 }
407
addVQADD_S32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)408 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
409 //vqadd.s32 Q#1, Q#1, Q#2
410 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
411 ((uint32_t *)buf)[0] = op;
412 return buf + 4;
413 }
414
addVMLAL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)415 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
416 //vmlal.f32 Q#1, D#1, D#2[#]
417 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
418 ((uint32_t *)buf)[0] = op;
419 return buf + 4;
420 }
421
addVMULL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)422 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
423 //vmull.f32 Q#1, D#1, D#2[#]
424 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
425 ((uint32_t *)buf)[0] = op;
426 return buf + 4;
427 }
428
addVORR_32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)429 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
430 //vadd.f32 Q#1, D#1, D#2
431 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
432 ((uint32_t *)buf)[0] = op;
433 return buf + 4;
434 }
435
addVMOV_32(uint8_t * buf,uint32_t dest_q,uint32_t imm)436 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
437 //vmov.32 Q#1, #imm
438 rsAssert(imm == 0);
439 uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
440 ((uint32_t *)buf)[0] = op;
441 return buf + 4;
442 }
443
addVADD_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)444 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
445 //vadd.f32 Q#1, D#1, D#2
446 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
447 ((uint32_t *)buf)[0] = op;
448 return buf + 4;
449 }
450 #endif
451
452 #if defined(ARCH_X86_HAVE_SSSE3)
453 extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
454 const int16_t *coef, uint32_t count);
455 extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
456 const int16_t *coef, uint32_t count);
457 extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
458 const int16_t *coef, uint32_t count);
459
460 using android::renderscript::Key_t;
461
selectKernel(Key_t key)462 void * selectKernel(Key_t key)
463 {
464 void * kernel = nullptr;
465
466 // inType, outType float if nonzero
467 if (!(key.u.inType || key.u.outType)) {
468 if (key.u.dot)
469 kernel = (void *)rsdIntrinsicColorMatrixDot_K;
470 else if (key.u.copyAlpha)
471 kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
472 else
473 kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
474 }
475
476 return kernel;
477 }
478 #endif
479
480 namespace android {
481 namespace renderscript {
482
build(Key_t key)483 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
484 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
485 mBufSize = 4096;
486 //StopWatch build_time("rs cm: build time");
487 mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
488 MAP_PRIVATE | MAP_ANON, -1, 0);
489 if (mBuf == MAP_FAILED) {
490 mBuf = NULL;
491 return false;
492 }
493
494 uint8_t *buf = mBuf;
495 uint8_t *buf2 = nullptr;
496
497 int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final
498 int opInit[4] = {0, 0, 0, 0};
499
500 memset(ops, 0, sizeof(ops));
501 for (int i=0; i < 4; i++) {
502 if (key.u.coeffMask & (1 << (i*4))) {
503 ops[i][0] = 0x2 | opInit[0];
504 opInit[0] = 1;
505 }
506 if (!key.u.dot) {
507 if (key.u.coeffMask & (1 << (1 + i*4))) {
508 ops[i][1] = 0x2 | opInit[1];
509 opInit[1] = 1;
510 }
511 if (key.u.coeffMask & (1 << (2 + i*4))) {
512 ops[i][2] = 0x2 | opInit[2];
513 opInit[2] = 1;
514 }
515 }
516 if (!key.u.copyAlpha) {
517 if (key.u.coeffMask & (1 << (3 + i*4))) {
518 ops[i][3] = 0x2 | opInit[3];
519 opInit[3] = 1;
520 }
521 }
522 }
523
524 if (key.u.inType || key.u.outType) {
525 key.u.copyAlpha = 0;
526 ADD_CHUNK(prefix_f);
527 buf2 = buf;
528
529 // Load the incoming r,g,b,a as needed
530 if (key.u.inType) {
531 switch(key.u.inVecSize) {
532 case 3:
533 ADD_CHUNK(load_f32_4);
534 break;
535 case 2:
536 ADD_CHUNK(load_f32_3);
537 break;
538 case 1:
539 ADD_CHUNK(load_f32_2);
540 break;
541 case 0:
542 ADD_CHUNK(load_f32_1);
543 break;
544 }
545 } else {
546 switch(key.u.inVecSize) {
547 case 3:
548 ADD_CHUNK(load_u8f_4);
549 break;
550 case 2:
551 ADD_CHUNK(load_u8f_3);
552 break;
553 case 1:
554 ADD_CHUNK(load_u8f_2);
555 break;
556 case 0:
557 ADD_CHUNK(load_u8f_1);
558 break;
559 }
560 }
561
562 for (int i=0; i < 4; i++) {
563 for (int j=0; j < 4; j++) {
564 switch(ops[i][j]) {
565 case 0:
566 break;
567 case 2:
568 buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
569 break;
570 case 3:
571 buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
572 break;
573 }
574 }
575 }
576 for (int j=0; j < 4; j++) {
577 if (opInit[j]) {
578 if (key.u.addMask & (1 << j)) {
579 buf = addVADD_F32(buf, j, 12+j, 8+j);
580 } else {
581 buf = addVORR_32(buf, j, 12+j, 12+j);
582 }
583 } else {
584 if (key.u.addMask & (1 << j)) {
585 buf = addVORR_32(buf, j, 8+j, 8+j);
586 } else {
587 buf = addVMOV_32(buf, j, 0);
588 }
589 }
590 }
591
592 if (key.u.outType) {
593 switch(key.u.outVecSize) {
594 case 3:
595 ADD_CHUNK(store_f32_4);
596 break;
597 case 2:
598 ADD_CHUNK(store_f32_3);
599 break;
600 case 1:
601 ADD_CHUNK(store_f32_2);
602 break;
603 case 0:
604 ADD_CHUNK(store_f32_1);
605 break;
606 }
607 } else {
608 switch(key.u.outVecSize) {
609 case 3:
610 case 2:
611 ADD_CHUNK(store_f32u_4);
612 break;
613 case 1:
614 ADD_CHUNK(store_f32u_2);
615 break;
616 case 0:
617 ADD_CHUNK(store_f32u_1);
618 break;
619 }
620 }
621
622
623 } else {
624 // Add the function prefix
625 // Store the address for the loop return
626 ADD_CHUNK(prefix_i);
627 buf2 = buf;
628
629 // Load the incoming r,g,b,a as needed
630 switch(key.u.inVecSize) {
631 case 3:
632 ADD_CHUNK(load_u8_4);
633 if (key.u.copyAlpha) {
634 ADD_CHUNK(unpack_u8_3);
635 } else {
636 ADD_CHUNK(unpack_u8_4);
637 }
638 break;
639 case 2:
640 ADD_CHUNK(load_u8_3);
641 ADD_CHUNK(unpack_u8_3);
642 break;
643 case 1:
644 ADD_CHUNK(load_u8_2);
645 ADD_CHUNK(unpack_u8_2);
646 break;
647 case 0:
648 ADD_CHUNK(load_u8_1);
649 ADD_CHUNK(unpack_u8_1);
650 break;
651 }
652
653 // Add multiply and accumulate
654 // use MULL to init the output register,
655 // use MLAL from there
656 for (int i=0; i < 4; i++) {
657 for (int j=0; j < 4; j++) {
658 switch(ops[i][j]) {
659 case 0:
660 break;
661 case 2:
662 buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
663 break;
664 case 3:
665 buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
666 break;
667 }
668 }
669 }
670 for (int j=0; j < 4; j++) {
671 if (opInit[j]) {
672 if (key.u.addMask & (1 << j)) {
673 buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
674 }
675 } else {
676 if (key.u.addMask & (1 << j)) {
677 buf = addVORR_32(buf, 8+j, 4+j, 4+j);
678 }
679 }
680 }
681
682 // If we have a dot product, perform the special pack.
683 if (key.u.dot) {
684 ADD_CHUNK(pack_u8_1);
685 ADD_CHUNK(dot);
686 } else {
687 switch(key.u.outVecSize) {
688 case 3:
689 if (key.u.copyAlpha) {
690 ADD_CHUNK(pack_u8_3);
691 } else {
692 ADD_CHUNK(pack_u8_4);
693 }
694 break;
695 case 2:
696 ADD_CHUNK(pack_u8_3);
697 break;
698 case 1:
699 ADD_CHUNK(pack_u8_2);
700 break;
701 case 0:
702 ADD_CHUNK(pack_u8_1);
703 break;
704 }
705 }
706
707 // Write out result
708 switch(key.u.outVecSize) {
709 case 3:
710 case 2:
711 ADD_CHUNK(store_u8_4);
712 break;
713 case 1:
714 ADD_CHUNK(store_u8_2);
715 break;
716 case 0:
717 ADD_CHUNK(store_u8_1);
718 break;
719 }
720 }
721
722 if (key.u.inType != key.u.outType) {
723 key.u.copyAlpha = 0;
724 key.u.dot = 0;
725 }
726
727 // Loop, branch, and cleanup
728 ADD_CHUNK(postfix1);
729 buf = addBranch(buf, buf2, 0x01);
730 ADD_CHUNK(postfix2);
731
732 int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
733 if (ret == -1) {
734 ALOGE("mprotect error %i", ret);
735 return false;
736 }
737
738 __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
739 return true;
740 #else
741 return false;
742 #endif
743 }
744
updateCoeffCache(float fpMul,float addMul)745 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
746 for(int ct=0; ct < 16; ct++) {
747 ip[ct] = (int16_t)(fp[ct] * 256.f + 0.5f);
748 tmpFp[ct] = fp[ct] * fpMul;
749 //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]);
750 }
751
752 float add = 0.f;
753 if (fpMul > 254.f) add = 0.5f;
754 for(int ct=0; ct < 4; ct++) {
755 tmpFpa[ct] = fpa[ct] * addMul + add;
756 //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
757 }
758
759 for(int ct=0; ct < 4; ct++) {
760 ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
761 }
762 }
763
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)764 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
765 size_t dataLength) {
766 switch(slot) {
767 case 0:
768 memcpy (fp, data, sizeof(fp));
769 break;
770 case 1:
771 memcpy (fpa, data, sizeof(fpa));
772 break;
773 default:
774 rsAssert(0);
775 break;
776 }
777 mRootPtr = &kernel;
778 }
779
780
One(const RsExpandKernelDriverInfo * info,void * out,const void * py,const float * coeff,const float * add,uint32_t vsin,uint32_t vsout,bool fin,bool fout)781 static void One(const RsExpandKernelDriverInfo *info, void *out,
782 const void *py, const float* coeff, const float *add,
783 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
784
785 float4 f = 0.f;
786 if (fin) {
787 switch(vsin) {
788 case 3:
789 f = ((const float4 *)py)[0];
790 break;
791 case 2:
792 f = ((const float4 *)py)[0];
793 f.w = 0.f;
794 break;
795 case 1:
796 f.xy = ((const float2 *)py)[0];
797 break;
798 case 0:
799 f.x = ((const float *)py)[0];
800 break;
801 }
802 } else {
803 switch(vsin) {
804 case 3:
805 f = convert_float4(((const uchar4 *)py)[0]);
806 break;
807 case 2:
808 f = convert_float4(((const uchar4 *)py)[0]);
809 f.w = 0.f;
810 break;
811 case 1:
812 f.xy = convert_float2(((const uchar2 *)py)[0]);
813 break;
814 case 0:
815 f.x = (float)(((const uchar *)py)[0]);
816 break;
817 }
818 }
819 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w);
820
821 float4 sum;
822 sum.x = f.x * coeff[0] +
823 f.y * coeff[4] +
824 f.z * coeff[8] +
825 f.w * coeff[12];
826 sum.y = f.x * coeff[1] +
827 f.y * coeff[5] +
828 f.z * coeff[9] +
829 f.w * coeff[13];
830 sum.z = f.x * coeff[2] +
831 f.y * coeff[6] +
832 f.z * coeff[10] +
833 f.w * coeff[14];
834 sum.w = f.x * coeff[3] +
835 f.y * coeff[7] +
836 f.z * coeff[11] +
837 f.w * coeff[15];
838 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
839
840 sum.x += add[0];
841 sum.y += add[1];
842 sum.z += add[2];
843 sum.w += add[3];
844
845
846 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
847 if (fout) {
848 switch(vsout) {
849 case 3:
850 case 2:
851 ((float4 *)out)[0] = sum;
852 break;
853 case 1:
854 ((float2 *)out)[0] = sum.xy;
855 break;
856 case 0:
857 ((float *)out)[0] = sum.x;
858 break;
859 }
860 } else {
861 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
862 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
863 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
864 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
865
866 switch(vsout) {
867 case 3:
868 case 2:
869 ((uchar4 *)out)[0] = convert_uchar4(sum);
870 break;
871 case 1:
872 ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
873 break;
874 case 0:
875 ((uchar *)out)[0] = sum.x;
876 break;
877 }
878 }
879 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
880 }
881
kernel(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)882 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info,
883 uint32_t xstart, uint32_t xend,
884 uint32_t outstep) {
885 RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr;
886
887 uint32_t instep = info->inStride[0];
888
889 uchar *out = (uchar *)info->outPtr[0];
890 uchar *in = (uchar *)info->inPtr[0];
891 uint32_t x1 = xstart;
892 uint32_t x2 = xend;
893
894 uint32_t vsin = cp->mLastKey.u.inVecSize;
895 uint32_t vsout = cp->mLastKey.u.outVecSize;
896 bool floatIn = !!cp->mLastKey.u.inType;
897 bool floatOut = !!cp->mLastKey.u.outType;
898
899 //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
900
901 if(x2 > x1) {
902 int32_t len = x2 - x1;
903 if (gArchUseSIMD) {
904 if((cp->mOptKernel != nullptr) && (len >= 4)) {
905 // The optimized kernel processes 4 pixels at once
906 // and requires a minimum of 1 chunk of 4
907 cp->mOptKernel(out, in, cp->ip, len >> 2);
908 // Update the len and pointers so the generic code can
909 // finish any leftover pixels
910 len &= ~3;
911 x1 += len;
912 out += outstep * len;
913 in += instep * len;
914 }
915 #if defined(ARCH_ARM64_USE_INTRINSICS)
916 else {
917 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
918 // Currently this generates off by one errors.
919 //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
920 //x1 += len;
921 //out += outstep * len;
922 //in += instep * len;
923 } else {
924 rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
925 x1 += len;
926 out += outstep * len;
927 in += instep * len;
928 }
929 }
930 #endif
931 }
932
933 while(x1 != x2) {
934 One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
935 out += outstep;
936 in += instep;
937 x1++;
938 }
939 }
940 }
941
preLaunch(uint32_t slot,const Allocation ** ains,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)942 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
943 const Allocation ** ains,
944 uint32_t inLen,
945 Allocation * aout,
946 const void * usr,
947 uint32_t usrLen,
948 const RsScriptCall *sc) {
949
950 const Element *ein = ains[0]->mHal.state.type->getElement();
951 const Element *eout = aout->mHal.state.type->getElement();
952
953 if (ein->getType() == eout->getType()) {
954 if (eout->getType() == RS_TYPE_UNSIGNED_8) {
955 updateCoeffCache(1.f, 255.f);
956 } else {
957 updateCoeffCache(1.f, 1.f);
958 }
959 } else {
960 if (eout->getType() == RS_TYPE_UNSIGNED_8) {
961 updateCoeffCache(255.f, 255.f);
962 } else {
963 updateCoeffCache(1.f / 255.f, 1.f);
964 }
965 }
966
967 Key_t key = computeKey(ein, eout);
968
969 #if defined(ARCH_X86_HAVE_SSSE3)
970 if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
971 // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
972 // mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key);
973 mLastKey = key;
974 }
975
976 #else //if !defined(ARCH_X86_HAVE_SSSE3)
977 if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
978 if (mBuf) munmap(mBuf, mBufSize);
979 mBuf = nullptr;
980 mOptKernel = nullptr;
981 if (build(key)) {
982 mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf;
983 }
984 #if defined(ARCH_ARM64_USE_INTRINSICS)
985 else {
986 int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
987 int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
988 uint32_t mm = 0;
989 int i;
990 for (i = 0; i < 4; i++)
991 {
992 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
993 m = ((m * 0x249) >> 9) & 15;
994 m |= ((key.u.addMask >> i) & 1) << 4;
995 mm |= m << (i * 5);
996 }
997
998 if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
999 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
1000 } else {
1001 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
1002 }
1003 }
1004 #endif
1005 mLastKey = key;
1006 }
1007 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
1008 }
1009
RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)1010 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
1011 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1012 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
1013
1014 mLastKey.key = 0;
1015 mBuf = nullptr;
1016 mBufSize = 0;
1017 mOptKernel = nullptr;
1018 const static float defaultMatrix[] = {
1019 1.f, 0.f, 0.f, 0.f,
1020 0.f, 1.f, 0.f, 0.f,
1021 0.f, 0.f, 1.f, 0.f,
1022 0.f, 0.f, 0.f, 1.f
1023 };
1024 const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
1025 setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
1026 setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
1027 }
1028
~RsdCpuScriptIntrinsicColorMatrix()1029 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
1030 if (mBuf) munmap(mBuf, mBufSize);
1031 mBuf = nullptr;
1032 mOptKernel = nullptr;
1033 }
1034
populateScript(Script * s)1035 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
1036 s->mHal.info.exportedVariableCount = 2;
1037 }
1038
rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)1039 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1040 const Script *s, const Element *e) {
1041
1042 return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
1043 }
1044
1045 } // namespace renderscript
1046 } // namespace android
1047