1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18 #include "rsCpuIntrinsic.h"
19 #include "rsCpuIntrinsicInlines.h"
20
21 namespace android {
22 namespace renderscript {
23
24
25 class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
26 public:
27 void populateScript(Script *) override;
28 void invokeFreeChildren() override;
29
30 void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
31 void setGlobalObj(uint32_t slot, ObjectBase *data) override;
32
33 ~RsdCpuScriptIntrinsicConvolve5x5() override;
34 RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
35
36 protected:
37 float mFp[28];
38 int16_t mIp[28];
39 ObjectBaseRef<Allocation> alloc;
40
41
42 static void kernelU1(const RsExpandKernelDriverInfo *info,
43 uint32_t xstart, uint32_t xend,
44 uint32_t outstep);
45 static void kernelU2(const RsExpandKernelDriverInfo *info,
46 uint32_t xstart, uint32_t xend,
47 uint32_t outstep);
48 static void kernelU4(const RsExpandKernelDriverInfo *info,
49 uint32_t xstart, uint32_t xend,
50 uint32_t outstep);
51 static void kernelF1(const RsExpandKernelDriverInfo *info,
52 uint32_t xstart, uint32_t xend,
53 uint32_t outstep);
54 static void kernelF2(const RsExpandKernelDriverInfo *info,
55 uint32_t xstart, uint32_t xend,
56 uint32_t outstep);
57 static void kernelF4(const RsExpandKernelDriverInfo *info,
58 uint32_t xstart, uint32_t xend,
59 uint32_t outstep);
60
61
62 };
63
setGlobalObj(uint32_t slot,ObjectBase * data)64 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
65 rsAssert(slot == 1);
66 alloc.set(static_cast<Allocation *>(data));
67 }
68
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)69 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
70 const void *data, size_t dataLength) {
71 rsAssert(slot == 0);
72 memcpy (&mFp, data, dataLength);
73 for(int ct=0; ct < 25; ct++) {
74 if (mFp[ct] >= 0) {
75 mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
76 } else {
77 mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
78 }
79 }
80 }
81
82
OneU4(const RsExpandKernelDriverInfo * info,uint32_t x,uchar4 * out,const uchar4 * py0,const uchar4 * py1,const uchar4 * py2,const uchar4 * py3,const uchar4 * py4,const float * coeff)83 static void OneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
84 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
85 const float* coeff) {
86
87 uint32_t x0 = rsMax((int32_t)x-2, 0);
88 uint32_t x1 = rsMax((int32_t)x-1, 0);
89 uint32_t x2 = x;
90 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
91 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
92
93 float4 px = convert_float4(py0[x0]) * coeff[0] +
94 convert_float4(py0[x1]) * coeff[1] +
95 convert_float4(py0[x2]) * coeff[2] +
96 convert_float4(py0[x3]) * coeff[3] +
97 convert_float4(py0[x4]) * coeff[4] +
98
99 convert_float4(py1[x0]) * coeff[5] +
100 convert_float4(py1[x1]) * coeff[6] +
101 convert_float4(py1[x2]) * coeff[7] +
102 convert_float4(py1[x3]) * coeff[8] +
103 convert_float4(py1[x4]) * coeff[9] +
104
105 convert_float4(py2[x0]) * coeff[10] +
106 convert_float4(py2[x1]) * coeff[11] +
107 convert_float4(py2[x2]) * coeff[12] +
108 convert_float4(py2[x3]) * coeff[13] +
109 convert_float4(py2[x4]) * coeff[14] +
110
111 convert_float4(py3[x0]) * coeff[15] +
112 convert_float4(py3[x1]) * coeff[16] +
113 convert_float4(py3[x2]) * coeff[17] +
114 convert_float4(py3[x3]) * coeff[18] +
115 convert_float4(py3[x4]) * coeff[19] +
116
117 convert_float4(py4[x0]) * coeff[20] +
118 convert_float4(py4[x1]) * coeff[21] +
119 convert_float4(py4[x2]) * coeff[22] +
120 convert_float4(py4[x3]) * coeff[23] +
121 convert_float4(py4[x4]) * coeff[24];
122 px = clamp(px + 0.5f, 0.f, 255.f);
123 *out = convert_uchar4(px);
124 }
125
OneU2(const RsExpandKernelDriverInfo * info,uint32_t x,uchar2 * out,const uchar2 * py0,const uchar2 * py1,const uchar2 * py2,const uchar2 * py3,const uchar2 * py4,const float * coeff)126 static void OneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
127 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
128 const float* coeff) {
129
130 uint32_t x0 = rsMax((int32_t)x-2, 0);
131 uint32_t x1 = rsMax((int32_t)x-1, 0);
132 uint32_t x2 = x;
133 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
134 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
135
136 float2 px = convert_float2(py0[x0]) * coeff[0] +
137 convert_float2(py0[x1]) * coeff[1] +
138 convert_float2(py0[x2]) * coeff[2] +
139 convert_float2(py0[x3]) * coeff[3] +
140 convert_float2(py0[x4]) * coeff[4] +
141
142 convert_float2(py1[x0]) * coeff[5] +
143 convert_float2(py1[x1]) * coeff[6] +
144 convert_float2(py1[x2]) * coeff[7] +
145 convert_float2(py1[x3]) * coeff[8] +
146 convert_float2(py1[x4]) * coeff[9] +
147
148 convert_float2(py2[x0]) * coeff[10] +
149 convert_float2(py2[x1]) * coeff[11] +
150 convert_float2(py2[x2]) * coeff[12] +
151 convert_float2(py2[x3]) * coeff[13] +
152 convert_float2(py2[x4]) * coeff[14] +
153
154 convert_float2(py3[x0]) * coeff[15] +
155 convert_float2(py3[x1]) * coeff[16] +
156 convert_float2(py3[x2]) * coeff[17] +
157 convert_float2(py3[x3]) * coeff[18] +
158 convert_float2(py3[x4]) * coeff[19] +
159
160 convert_float2(py4[x0]) * coeff[20] +
161 convert_float2(py4[x1]) * coeff[21] +
162 convert_float2(py4[x2]) * coeff[22] +
163 convert_float2(py4[x3]) * coeff[23] +
164 convert_float2(py4[x4]) * coeff[24];
165 px = clamp(px + 0.5f, 0.f, 255.f);
166 *out = convert_uchar2(px);
167 }
168
OneU1(const RsExpandKernelDriverInfo * info,uint32_t x,uchar * out,const uchar * py0,const uchar * py1,const uchar * py2,const uchar * py3,const uchar * py4,const float * coeff)169 static void OneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
170 const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
171 const float* coeff) {
172
173 uint32_t x0 = rsMax((int32_t)x-2, 0);
174 uint32_t x1 = rsMax((int32_t)x-1, 0);
175 uint32_t x2 = x;
176 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
177 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
178
179 float px = (float)(py0[x0]) * coeff[0] +
180 (float)(py0[x1]) * coeff[1] +
181 (float)(py0[x2]) * coeff[2] +
182 (float)(py0[x3]) * coeff[3] +
183 (float)(py0[x4]) * coeff[4] +
184
185 (float)(py1[x0]) * coeff[5] +
186 (float)(py1[x1]) * coeff[6] +
187 (float)(py1[x2]) * coeff[7] +
188 (float)(py1[x3]) * coeff[8] +
189 (float)(py1[x4]) * coeff[9] +
190
191 (float)(py2[x0]) * coeff[10] +
192 (float)(py2[x1]) * coeff[11] +
193 (float)(py2[x2]) * coeff[12] +
194 (float)(py2[x3]) * coeff[13] +
195 (float)(py2[x4]) * coeff[14] +
196
197 (float)(py3[x0]) * coeff[15] +
198 (float)(py3[x1]) * coeff[16] +
199 (float)(py3[x2]) * coeff[17] +
200 (float)(py3[x3]) * coeff[18] +
201 (float)(py3[x4]) * coeff[19] +
202
203 (float)(py4[x0]) * coeff[20] +
204 (float)(py4[x1]) * coeff[21] +
205 (float)(py4[x2]) * coeff[22] +
206 (float)(py4[x3]) * coeff[23] +
207 (float)(py4[x4]) * coeff[24];
208 px = clamp(px + 0.5f, 0.f, 255.f);
209 *out = px;
210 }
211
OneF4(const RsExpandKernelDriverInfo * info,uint32_t x,float4 * out,const float4 * py0,const float4 * py1,const float4 * py2,const float4 * py3,const float4 * py4,const float * coeff)212 static void OneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
213 const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
214 const float* coeff) {
215
216 uint32_t x0 = rsMax((int32_t)x-2, 0);
217 uint32_t x1 = rsMax((int32_t)x-1, 0);
218 uint32_t x2 = x;
219 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
220 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
221
222 float4 px = py0[x0] * coeff[0] +
223 py0[x1] * coeff[1] +
224 py0[x2] * coeff[2] +
225 py0[x3] * coeff[3] +
226 py0[x4] * coeff[4] +
227
228 py1[x0] * coeff[5] +
229 py1[x1] * coeff[6] +
230 py1[x2] * coeff[7] +
231 py1[x3] * coeff[8] +
232 py1[x4] * coeff[9] +
233
234 py2[x0] * coeff[10] +
235 py2[x1] * coeff[11] +
236 py2[x2] * coeff[12] +
237 py2[x3] * coeff[13] +
238 py2[x4] * coeff[14] +
239
240 py3[x0] * coeff[15] +
241 py3[x1] * coeff[16] +
242 py3[x2] * coeff[17] +
243 py3[x3] * coeff[18] +
244 py3[x4] * coeff[19] +
245
246 py4[x0] * coeff[20] +
247 py4[x1] * coeff[21] +
248 py4[x2] * coeff[22] +
249 py4[x3] * coeff[23] +
250 py4[x4] * coeff[24];
251 *out = px;
252 }
253
OneF2(const RsExpandKernelDriverInfo * info,uint32_t x,float2 * out,const float2 * py0,const float2 * py1,const float2 * py2,const float2 * py3,const float2 * py4,const float * coeff)254 static void OneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
255 const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
256 const float* coeff) {
257
258 uint32_t x0 = rsMax((int32_t)x-2, 0);
259 uint32_t x1 = rsMax((int32_t)x-1, 0);
260 uint32_t x2 = x;
261 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
262 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
263
264 float2 px = py0[x0] * coeff[0] +
265 py0[x1] * coeff[1] +
266 py0[x2] * coeff[2] +
267 py0[x3] * coeff[3] +
268 py0[x4] * coeff[4] +
269
270 py1[x0] * coeff[5] +
271 py1[x1] * coeff[6] +
272 py1[x2] * coeff[7] +
273 py1[x3] * coeff[8] +
274 py1[x4] * coeff[9] +
275
276 py2[x0] * coeff[10] +
277 py2[x1] * coeff[11] +
278 py2[x2] * coeff[12] +
279 py2[x3] * coeff[13] +
280 py2[x4] * coeff[14] +
281
282 py3[x0] * coeff[15] +
283 py3[x1] * coeff[16] +
284 py3[x2] * coeff[17] +
285 py3[x3] * coeff[18] +
286 py3[x4] * coeff[19] +
287
288 py4[x0] * coeff[20] +
289 py4[x1] * coeff[21] +
290 py4[x2] * coeff[22] +
291 py4[x3] * coeff[23] +
292 py4[x4] * coeff[24];
293 *out = px;
294 }
295
OneF1(const RsExpandKernelDriverInfo * info,uint32_t x,float * out,const float * py0,const float * py1,const float * py2,const float * py3,const float * py4,const float * coeff)296 static void OneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
297 const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
298 const float* coeff) {
299
300 uint32_t x0 = rsMax((int32_t)x-2, 0);
301 uint32_t x1 = rsMax((int32_t)x-1, 0);
302 uint32_t x2 = x;
303 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
304 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
305
306 float px = py0[x0] * coeff[0] +
307 py0[x1] * coeff[1] +
308 py0[x2] * coeff[2] +
309 py0[x3] * coeff[3] +
310 py0[x4] * coeff[4] +
311
312 py1[x0] * coeff[5] +
313 py1[x1] * coeff[6] +
314 py1[x2] * coeff[7] +
315 py1[x3] * coeff[8] +
316 py1[x4] * coeff[9] +
317
318 py2[x0] * coeff[10] +
319 py2[x1] * coeff[11] +
320 py2[x2] * coeff[12] +
321 py2[x3] * coeff[13] +
322 py2[x4] * coeff[14] +
323
324 py3[x0] * coeff[15] +
325 py3[x1] * coeff[16] +
326 py3[x2] * coeff[17] +
327 py3[x3] * coeff[18] +
328 py3[x4] * coeff[19] +
329
330 py4[x0] * coeff[20] +
331 py4[x1] * coeff[21] +
332 py4[x2] * coeff[22] +
333 py4[x3] * coeff[23] +
334 py4[x4] * coeff[24];
335 *out = px;
336 }
337
338
/**
 * Externally implemented (C linkage) fast path used by kernelU4.
 *
 * @param dst   Where results are written.
 * @param y0    First of the five source rows; kernelU4 passes each row pointer
 *              already moved two pixels to the left of the first output column.
 * @param y1    Second source row.
 * @param y2    Third (centre) source row.
 * @param y3    Fourth source row.
 * @param y4    Fifth source row.
 * @param coef  16-bit coefficient table (kernelU4 passes mIp).
 * @param count Number of work units. Judging by how kernelU4 advances its
 *              output pointer, one unit is 4 pixels on the SSSE3 build and
 *              2 pixels on the ARM build.
 */
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst,
                                          const void *y0,
                                          const void *y1,
                                          const void *y2,
                                          const void *y3,
                                          const void *y4,
                                          const int16_t *coef,
                                          uint32_t count);
342
kernelU4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)343 void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelDriverInfo *info,
344 uint32_t xstart, uint32_t xend,
345 uint32_t outstep) {
346 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
347 if (!cp->alloc.get()) {
348 ALOGE("Convolve5x5 executed without input, skipping");
349 return;
350 }
351 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
352 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
353
354 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
355 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
356 uint32_t y2 = info->current.y;
357 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
358 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
359
360 const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
361 const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
362 const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
363 const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
364 const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
365
366 uchar4 *out = (uchar4 *)info->outPtr[0];
367 uint32_t x1 = xstart;
368 uint32_t x2 = xend;
369
370 while((x1 < x2) && (x1 < 2)) {
371 OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
372 out++;
373 x1++;
374 }
375 #if defined(ARCH_X86_HAVE_SSSE3)
376 // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
377 // 3 for end boundary where x may hit the end boundary)
378 if (gArchUseSIMD &&((x1 + 6) < x2)) {
379 // subtract 3 for end boundary
380 uint32_t len = (x2 - x1 - 3) >> 2;
381 rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
382 out += len << 2;
383 x1 += len << 2;
384 }
385 #endif
386
387 #if defined(ARCH_ARM_USE_INTRINSICS)
388 if(gArchUseSIMD && ((x1 + 3) < x2)) {
389 uint32_t len = (x2 - x1 - 3) >> 1;
390 rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
391 out += len << 1;
392 x1 += len << 1;
393 }
394 #endif
395
396 while(x1 < x2) {
397 OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
398 out++;
399 x1++;
400 }
401 }
402
kernelU2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)403 void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelDriverInfo *info,
404 uint32_t xstart, uint32_t xend,
405 uint32_t outstep) {
406 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
407 if (!cp->alloc.get()) {
408 ALOGE("Convolve5x5 executed without input, skipping");
409 return;
410 }
411 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
412 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
413
414 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
415 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
416 uint32_t y2 = info->current.y;
417 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
418 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
419
420 const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
421 const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
422 const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
423 const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
424 const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
425
426 uchar2 *out = (uchar2 *)info->outPtr[0];
427 uint32_t x1 = xstart;
428 uint32_t x2 = xend;
429
430 while((x1 < x2) && (x1 < 2)) {
431 OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
432 out++;
433 x1++;
434 }
435
436 #if 0//defined(ARCH_ARM_HAVE_NEON)
437 if((x1 + 3) < x2) {
438 uint32_t len = (x2 - x1 - 3) >> 1;
439 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
440 out += len << 1;
441 x1 += len << 1;
442 }
443 #endif
444
445 while(x1 < x2) {
446 OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
447 out++;
448 x1++;
449 }
450 }
451
kernelU1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)452 void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelDriverInfo *info,
453 uint32_t xstart, uint32_t xend,
454 uint32_t outstep) {
455 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
456 if (!cp->alloc.get()) {
457 ALOGE("Convolve5x5 executed without input, skipping");
458 return;
459 }
460 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
461 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
462
463 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
464 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
465 uint32_t y2 = info->current.y;
466 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
467 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
468
469 const uchar *py0 = (const uchar *)(pin + stride * y0);
470 const uchar *py1 = (const uchar *)(pin + stride * y1);
471 const uchar *py2 = (const uchar *)(pin + stride * y2);
472 const uchar *py3 = (const uchar *)(pin + stride * y3);
473 const uchar *py4 = (const uchar *)(pin + stride * y4);
474
475 uchar *out = (uchar *)info->outPtr[0];
476 uint32_t x1 = xstart;
477 uint32_t x2 = xend;
478
479 while((x1 < x2) && (x1 < 2)) {
480 OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
481 out++;
482 x1++;
483 }
484
485 #if 0//defined(ARCH_ARM_HAVE_NEON)
486 if((x1 + 3) < x2) {
487 uint32_t len = (x2 - x1 - 3) >> 1;
488 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
489 out += len << 1;
490 x1 += len << 1;
491 }
492 #endif
493
494 while(x1 < x2) {
495 OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
496 out++;
497 x1++;
498 }
499 }
500
kernelF4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)501 void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelDriverInfo *info,
502 uint32_t xstart, uint32_t xend,
503 uint32_t outstep) {
504 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
505 if (!cp->alloc.get()) {
506 ALOGE("Convolve5x5 executed without input, skipping");
507 return;
508 }
509 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
510 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
511
512 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
513 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
514 uint32_t y2 = info->current.y;
515 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
516 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
517
518 const float4 *py0 = (const float4 *)(pin + stride * y0);
519 const float4 *py1 = (const float4 *)(pin + stride * y1);
520 const float4 *py2 = (const float4 *)(pin + stride * y2);
521 const float4 *py3 = (const float4 *)(pin + stride * y3);
522 const float4 *py4 = (const float4 *)(pin + stride * y4);
523
524 float4 *out = (float4 *)info->outPtr[0];
525 uint32_t x1 = xstart;
526 uint32_t x2 = xend;
527
528 while((x1 < x2) && (x1 < 2)) {
529 OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
530 out++;
531 x1++;
532 }
533
534 #if 0//defined(ARCH_ARM_HAVE_NEON)
535 if((x1 + 3) < x2) {
536 uint32_t len = (x2 - x1 - 3) >> 1;
537 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
538 out += len << 1;
539 x1 += len << 1;
540 }
541 #endif
542
543 while(x1 < x2) {
544 OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
545 out++;
546 x1++;
547 }
548 }
549
kernelF2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)550 void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelDriverInfo *info,
551 uint32_t xstart, uint32_t xend,
552 uint32_t outstep) {
553 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
554 if (!cp->alloc.get()) {
555 ALOGE("Convolve5x5 executed without input, skipping");
556 return;
557 }
558 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
559 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
560
561 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
562 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
563 uint32_t y2 = info->current.y;
564 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
565 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
566
567 const float2 *py0 = (const float2 *)(pin + stride * y0);
568 const float2 *py1 = (const float2 *)(pin + stride * y1);
569 const float2 *py2 = (const float2 *)(pin + stride * y2);
570 const float2 *py3 = (const float2 *)(pin + stride * y3);
571 const float2 *py4 = (const float2 *)(pin + stride * y4);
572
573 float2 *out = (float2 *)info->outPtr[0];
574 uint32_t x1 = xstart;
575 uint32_t x2 = xend;
576
577 while((x1 < x2) && (x1 < 2)) {
578 OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
579 out++;
580 x1++;
581 }
582
583 #if 0//defined(ARCH_ARM_HAVE_NEON)
584 if((x1 + 3) < x2) {
585 uint32_t len = (x2 - x1 - 3) >> 1;
586 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
587 out += len << 1;
588 x1 += len << 1;
589 }
590 #endif
591
592 while(x1 < x2) {
593 OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
594 out++;
595 x1++;
596 }
597 }
598
kernelF1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)599 void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelDriverInfo *info,
600 uint32_t xstart, uint32_t xend,
601 uint32_t outstep) {
602 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
603 if (!cp->alloc.get()) {
604 ALOGE("Convolve5x5 executed without input, skipping");
605 return;
606 }
607 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
608 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
609
610 uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
611 uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
612 uint32_t y2 = info->current.y;
613 uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
614 uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
615
616 const float *py0 = (const float *)(pin + stride * y0);
617 const float *py1 = (const float *)(pin + stride * y1);
618 const float *py2 = (const float *)(pin + stride * y2);
619 const float *py3 = (const float *)(pin + stride * y3);
620 const float *py4 = (const float *)(pin + stride * y4);
621
622 float *out = (float *)info->outPtr[0];
623 uint32_t x1 = xstart;
624 uint32_t x2 = xend;
625
626 while((x1 < x2) && (x1 < 2)) {
627 OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
628 out++;
629 x1++;
630 }
631
632 #if 0//defined(ARCH_ARM_HAVE_NEON)
633 if((x1 + 3) < x2) {
634 uint32_t len = (x2 - x1 - 3) >> 1;
635 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
636 out += len << 1;
637 x1 += len << 1;
638 }
639 #endif
640
641 while(x1 < x2) {
642 OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
643 out++;
644 x1++;
645 }
646 }
647
RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)648 RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
649 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
650 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
651
652 if (e->getType() == RS_TYPE_FLOAT_32) {
653 switch(e->getVectorSize()) {
654 case 1:
655 mRootPtr = &kernelF1;
656 break;
657 case 2:
658 mRootPtr = &kernelF2;
659 break;
660 case 3:
661 case 4:
662 mRootPtr = &kernelF4;
663 break;
664 }
665 } else {
666 switch(e->getVectorSize()) {
667 case 1:
668 mRootPtr = &kernelU1;
669 break;
670 case 2:
671 mRootPtr = &kernelU2;
672 break;
673 case 3:
674 case 4:
675 mRootPtr = &kernelU4;
676 break;
677 }
678 }
679 for(int ct=0; ct < 25; ct++) {
680 mFp[ct] = 1.f / 25.f;
681 mIp[ct] = (int16_t)(mFp[ct] * 256.f);
682 }
683 }
684
~RsdCpuScriptIntrinsicConvolve5x5()685 RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
686 }
687
populateScript(Script * s)688 void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
689 s->mHal.info.exportedVariableCount = 2;
690 }
691
invokeFreeChildren()692 void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
693 alloc.clear();
694 }
695
rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)696 RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
697 const Script *s, const Element *e) {
698
699 return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
700 }
701
702 } // namespace renderscript
703 } // namespace android
704