1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 
18 #include "rsCpuIntrinsic.h"
19 #include "rsCpuIntrinsicInlines.h"
20 
21 namespace android {
22 namespace renderscript {
23 
24 
25 class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
26 public:
27     void populateScript(Script *) override;
28     void invokeFreeChildren() override;
29 
30     void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
31     void setGlobalObj(uint32_t slot, ObjectBase *data) override;
32 
33     ~RsdCpuScriptIntrinsicConvolve5x5() override;
34     RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
35 
36 protected:
37     float mFp[28];
38     int16_t mIp[28];
39     ObjectBaseRef<Allocation> alloc;
40 
41 
42     static void kernelU1(const RsExpandKernelDriverInfo *info,
43                          uint32_t xstart, uint32_t xend,
44                          uint32_t outstep);
45     static void kernelU2(const RsExpandKernelDriverInfo *info,
46                          uint32_t xstart, uint32_t xend,
47                          uint32_t outstep);
48     static void kernelU4(const RsExpandKernelDriverInfo *info,
49                          uint32_t xstart, uint32_t xend,
50                          uint32_t outstep);
51     static void kernelF1(const RsExpandKernelDriverInfo *info,
52                          uint32_t xstart, uint32_t xend,
53                          uint32_t outstep);
54     static void kernelF2(const RsExpandKernelDriverInfo *info,
55                          uint32_t xstart, uint32_t xend,
56                          uint32_t outstep);
57     static void kernelF4(const RsExpandKernelDriverInfo *info,
58                          uint32_t xstart, uint32_t xend,
59                          uint32_t outstep);
60 
61 
62 };
63 
setGlobalObj(uint32_t slot,ObjectBase * data)64 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
65     rsAssert(slot == 1);
66     alloc.set(static_cast<Allocation *>(data));
67 }
68 
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)69 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
70                                                     const void *data, size_t dataLength) {
71     rsAssert(slot == 0);
72     memcpy (&mFp, data, dataLength);
73     for(int ct=0; ct < 25; ct++) {
74         if (mFp[ct] >= 0) {
75             mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
76         } else {
77             mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
78         }
79     }
80 }
81 
82 
OneU4(const RsExpandKernelDriverInfo * info,uint32_t x,uchar4 * out,const uchar4 * py0,const uchar4 * py1,const uchar4 * py2,const uchar4 * py3,const uchar4 * py4,const float * coeff)83 static void OneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
84                   const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
85                   const float* coeff) {
86 
87     uint32_t x0 = rsMax((int32_t)x-2, 0);
88     uint32_t x1 = rsMax((int32_t)x-1, 0);
89     uint32_t x2 = x;
90     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
91     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
92 
93     float4 px = convert_float4(py0[x0]) * coeff[0] +
94                 convert_float4(py0[x1]) * coeff[1] +
95                 convert_float4(py0[x2]) * coeff[2] +
96                 convert_float4(py0[x3]) * coeff[3] +
97                 convert_float4(py0[x4]) * coeff[4] +
98 
99                 convert_float4(py1[x0]) * coeff[5] +
100                 convert_float4(py1[x1]) * coeff[6] +
101                 convert_float4(py1[x2]) * coeff[7] +
102                 convert_float4(py1[x3]) * coeff[8] +
103                 convert_float4(py1[x4]) * coeff[9] +
104 
105                 convert_float4(py2[x0]) * coeff[10] +
106                 convert_float4(py2[x1]) * coeff[11] +
107                 convert_float4(py2[x2]) * coeff[12] +
108                 convert_float4(py2[x3]) * coeff[13] +
109                 convert_float4(py2[x4]) * coeff[14] +
110 
111                 convert_float4(py3[x0]) * coeff[15] +
112                 convert_float4(py3[x1]) * coeff[16] +
113                 convert_float4(py3[x2]) * coeff[17] +
114                 convert_float4(py3[x3]) * coeff[18] +
115                 convert_float4(py3[x4]) * coeff[19] +
116 
117                 convert_float4(py4[x0]) * coeff[20] +
118                 convert_float4(py4[x1]) * coeff[21] +
119                 convert_float4(py4[x2]) * coeff[22] +
120                 convert_float4(py4[x3]) * coeff[23] +
121                 convert_float4(py4[x4]) * coeff[24];
122     px = clamp(px + 0.5f, 0.f, 255.f);
123     *out = convert_uchar4(px);
124 }
125 
OneU2(const RsExpandKernelDriverInfo * info,uint32_t x,uchar2 * out,const uchar2 * py0,const uchar2 * py1,const uchar2 * py2,const uchar2 * py3,const uchar2 * py4,const float * coeff)126 static void OneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
127                   const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
128                   const float* coeff) {
129 
130     uint32_t x0 = rsMax((int32_t)x-2, 0);
131     uint32_t x1 = rsMax((int32_t)x-1, 0);
132     uint32_t x2 = x;
133     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
134     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
135 
136     float2 px = convert_float2(py0[x0]) * coeff[0] +
137                 convert_float2(py0[x1]) * coeff[1] +
138                 convert_float2(py0[x2]) * coeff[2] +
139                 convert_float2(py0[x3]) * coeff[3] +
140                 convert_float2(py0[x4]) * coeff[4] +
141 
142                 convert_float2(py1[x0]) * coeff[5] +
143                 convert_float2(py1[x1]) * coeff[6] +
144                 convert_float2(py1[x2]) * coeff[7] +
145                 convert_float2(py1[x3]) * coeff[8] +
146                 convert_float2(py1[x4]) * coeff[9] +
147 
148                 convert_float2(py2[x0]) * coeff[10] +
149                 convert_float2(py2[x1]) * coeff[11] +
150                 convert_float2(py2[x2]) * coeff[12] +
151                 convert_float2(py2[x3]) * coeff[13] +
152                 convert_float2(py2[x4]) * coeff[14] +
153 
154                 convert_float2(py3[x0]) * coeff[15] +
155                 convert_float2(py3[x1]) * coeff[16] +
156                 convert_float2(py3[x2]) * coeff[17] +
157                 convert_float2(py3[x3]) * coeff[18] +
158                 convert_float2(py3[x4]) * coeff[19] +
159 
160                 convert_float2(py4[x0]) * coeff[20] +
161                 convert_float2(py4[x1]) * coeff[21] +
162                 convert_float2(py4[x2]) * coeff[22] +
163                 convert_float2(py4[x3]) * coeff[23] +
164                 convert_float2(py4[x4]) * coeff[24];
165     px = clamp(px + 0.5f, 0.f, 255.f);
166     *out = convert_uchar2(px);
167 }
168 
OneU1(const RsExpandKernelDriverInfo * info,uint32_t x,uchar * out,const uchar * py0,const uchar * py1,const uchar * py2,const uchar * py3,const uchar * py4,const float * coeff)169 static void OneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
170                   const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
171                   const float* coeff) {
172 
173     uint32_t x0 = rsMax((int32_t)x-2, 0);
174     uint32_t x1 = rsMax((int32_t)x-1, 0);
175     uint32_t x2 = x;
176     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
177     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
178 
179     float px = (float)(py0[x0]) * coeff[0] +
180                (float)(py0[x1]) * coeff[1] +
181                (float)(py0[x2]) * coeff[2] +
182                (float)(py0[x3]) * coeff[3] +
183                (float)(py0[x4]) * coeff[4] +
184 
185                (float)(py1[x0]) * coeff[5] +
186                (float)(py1[x1]) * coeff[6] +
187                (float)(py1[x2]) * coeff[7] +
188                (float)(py1[x3]) * coeff[8] +
189                (float)(py1[x4]) * coeff[9] +
190 
191                (float)(py2[x0]) * coeff[10] +
192                (float)(py2[x1]) * coeff[11] +
193                (float)(py2[x2]) * coeff[12] +
194                (float)(py2[x3]) * coeff[13] +
195                (float)(py2[x4]) * coeff[14] +
196 
197                (float)(py3[x0]) * coeff[15] +
198                (float)(py3[x1]) * coeff[16] +
199                (float)(py3[x2]) * coeff[17] +
200                (float)(py3[x3]) * coeff[18] +
201                (float)(py3[x4]) * coeff[19] +
202 
203                (float)(py4[x0]) * coeff[20] +
204                (float)(py4[x1]) * coeff[21] +
205                (float)(py4[x2]) * coeff[22] +
206                (float)(py4[x3]) * coeff[23] +
207                (float)(py4[x4]) * coeff[24];
208     px = clamp(px + 0.5f, 0.f, 255.f);
209     *out = px;
210 }
211 
OneF4(const RsExpandKernelDriverInfo * info,uint32_t x,float4 * out,const float4 * py0,const float4 * py1,const float4 * py2,const float4 * py3,const float4 * py4,const float * coeff)212 static void OneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
213                   const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
214                   const float* coeff) {
215 
216     uint32_t x0 = rsMax((int32_t)x-2, 0);
217     uint32_t x1 = rsMax((int32_t)x-1, 0);
218     uint32_t x2 = x;
219     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
220     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
221 
222     float4 px = py0[x0] * coeff[0] +
223                 py0[x1] * coeff[1] +
224                 py0[x2] * coeff[2] +
225                 py0[x3] * coeff[3] +
226                 py0[x4] * coeff[4] +
227 
228                 py1[x0] * coeff[5] +
229                 py1[x1] * coeff[6] +
230                 py1[x2] * coeff[7] +
231                 py1[x3] * coeff[8] +
232                 py1[x4] * coeff[9] +
233 
234                 py2[x0] * coeff[10] +
235                 py2[x1] * coeff[11] +
236                 py2[x2] * coeff[12] +
237                 py2[x3] * coeff[13] +
238                 py2[x4] * coeff[14] +
239 
240                 py3[x0] * coeff[15] +
241                 py3[x1] * coeff[16] +
242                 py3[x2] * coeff[17] +
243                 py3[x3] * coeff[18] +
244                 py3[x4] * coeff[19] +
245 
246                 py4[x0] * coeff[20] +
247                 py4[x1] * coeff[21] +
248                 py4[x2] * coeff[22] +
249                 py4[x3] * coeff[23] +
250                 py4[x4] * coeff[24];
251     *out = px;
252 }
253 
OneF2(const RsExpandKernelDriverInfo * info,uint32_t x,float2 * out,const float2 * py0,const float2 * py1,const float2 * py2,const float2 * py3,const float2 * py4,const float * coeff)254 static void OneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
255                   const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
256                   const float* coeff) {
257 
258     uint32_t x0 = rsMax((int32_t)x-2, 0);
259     uint32_t x1 = rsMax((int32_t)x-1, 0);
260     uint32_t x2 = x;
261     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
262     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
263 
264     float2 px = py0[x0] * coeff[0] +
265                 py0[x1] * coeff[1] +
266                 py0[x2] * coeff[2] +
267                 py0[x3] * coeff[3] +
268                 py0[x4] * coeff[4] +
269 
270                 py1[x0] * coeff[5] +
271                 py1[x1] * coeff[6] +
272                 py1[x2] * coeff[7] +
273                 py1[x3] * coeff[8] +
274                 py1[x4] * coeff[9] +
275 
276                 py2[x0] * coeff[10] +
277                 py2[x1] * coeff[11] +
278                 py2[x2] * coeff[12] +
279                 py2[x3] * coeff[13] +
280                 py2[x4] * coeff[14] +
281 
282                 py3[x0] * coeff[15] +
283                 py3[x1] * coeff[16] +
284                 py3[x2] * coeff[17] +
285                 py3[x3] * coeff[18] +
286                 py3[x4] * coeff[19] +
287 
288                 py4[x0] * coeff[20] +
289                 py4[x1] * coeff[21] +
290                 py4[x2] * coeff[22] +
291                 py4[x3] * coeff[23] +
292                 py4[x4] * coeff[24];
293     *out = px;
294 }
295 
OneF1(const RsExpandKernelDriverInfo * info,uint32_t x,float * out,const float * py0,const float * py1,const float * py2,const float * py3,const float * py4,const float * coeff)296 static void OneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
297                   const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
298                   const float* coeff) {
299 
300     uint32_t x0 = rsMax((int32_t)x-2, 0);
301     uint32_t x1 = rsMax((int32_t)x-1, 0);
302     uint32_t x2 = x;
303     uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
304     uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
305 
306     float px = py0[x0] * coeff[0] +
307                py0[x1] * coeff[1] +
308                py0[x2] * coeff[2] +
309                py0[x3] * coeff[3] +
310                py0[x4] * coeff[4] +
311 
312                py1[x0] * coeff[5] +
313                py1[x1] * coeff[6] +
314                py1[x2] * coeff[7] +
315                py1[x3] * coeff[8] +
316                py1[x4] * coeff[9] +
317 
318                py2[x0] * coeff[10] +
319                py2[x1] * coeff[11] +
320                py2[x2] * coeff[12] +
321                py2[x3] * coeff[13] +
322                py2[x4] * coeff[14] +
323 
324                py3[x0] * coeff[15] +
325                py3[x1] * coeff[16] +
326                py3[x2] * coeff[17] +
327                py3[x3] * coeff[18] +
328                py3[x4] * coeff[19] +
329 
330                py4[x0] * coeff[20] +
331                py4[x1] * coeff[21] +
332                py4[x2] * coeff[22] +
333                py4[x3] * coeff[23] +
334                py4[x4] * coeff[24];
335     *out = px;
336 }
337 
338 
// Externally defined (C linkage) bulk convolution routine used by the SIMD
// paths of kernelU4.
//
//   dst        destination for the produced elements.
//   y0 .. y4   the five source rows; kernelU4 passes each row pointer already
//              shifted two elements to the left of the first output column.
//   coef       int16_t coefficient table (kernelU4 passes mIp).
//   count      iteration count; kernelU4 advances its output by 4 elements
//              per count on x86 and by 2 elements per count on ARM.
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst,
                                          const void *y0,
                                          const void *y1,
                                          const void *y2,
                                          const void *y3,
                                          const void *y4,
                                          const int16_t *coef,
                                          uint32_t count);
342 
kernelU4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)343 void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelDriverInfo *info,
344                                                 uint32_t xstart, uint32_t xend,
345                                                 uint32_t outstep) {
346     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
347     if (!cp->alloc.get()) {
348         ALOGE("Convolve5x5 executed without input, skipping");
349         return;
350     }
351     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
352     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
353 
354     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
355     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
356     uint32_t y2 = info->current.y;
357     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
358     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
359 
360     const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
361     const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
362     const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
363     const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
364     const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
365 
366     uchar4 *out = (uchar4 *)info->outPtr[0];
367     uint32_t x1 = xstart;
368     uint32_t x2 = xend;
369 
370     while((x1 < x2) && (x1 < 2)) {
371         OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
372         out++;
373         x1++;
374     }
375 #if defined(ARCH_X86_HAVE_SSSE3)
376     // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
377     // 3 for end boundary where x may hit the end boundary)
378     if (gArchUseSIMD &&((x1 + 6) < x2)) {
379         // subtract 3 for end boundary
380         uint32_t len = (x2 - x1 - 3) >> 2;
381         rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
382         out += len << 2;
383         x1 += len << 2;
384     }
385 #endif
386 
387 #if defined(ARCH_ARM_USE_INTRINSICS)
388     if(gArchUseSIMD && ((x1 + 3) < x2)) {
389         uint32_t len = (x2 - x1 - 3) >> 1;
390         rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
391         out += len << 1;
392         x1 += len << 1;
393     }
394 #endif
395 
396     while(x1 < x2) {
397         OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
398         out++;
399         x1++;
400     }
401 }
402 
kernelU2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)403 void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelDriverInfo *info,
404                                                 uint32_t xstart, uint32_t xend,
405                                                 uint32_t outstep) {
406     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
407     if (!cp->alloc.get()) {
408         ALOGE("Convolve5x5 executed without input, skipping");
409         return;
410     }
411     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
412     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
413 
414     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
415     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
416     uint32_t y2 = info->current.y;
417     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
418     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
419 
420     const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
421     const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
422     const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
423     const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
424     const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
425 
426     uchar2 *out = (uchar2 *)info->outPtr[0];
427     uint32_t x1 = xstart;
428     uint32_t x2 = xend;
429 
430     while((x1 < x2) && (x1 < 2)) {
431         OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
432         out++;
433         x1++;
434     }
435 
436 #if 0//defined(ARCH_ARM_HAVE_NEON)
437     if((x1 + 3) < x2) {
438         uint32_t len = (x2 - x1 - 3) >> 1;
439         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
440         out += len << 1;
441         x1 += len << 1;
442     }
443 #endif
444 
445     while(x1 < x2) {
446         OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
447         out++;
448         x1++;
449     }
450 }
451 
kernelU1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)452 void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelDriverInfo *info,
453                                                 uint32_t xstart, uint32_t xend,
454                                                 uint32_t outstep) {
455     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
456     if (!cp->alloc.get()) {
457         ALOGE("Convolve5x5 executed without input, skipping");
458         return;
459     }
460     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
461     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
462 
463     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
464     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
465     uint32_t y2 = info->current.y;
466     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
467     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
468 
469     const uchar *py0 = (const uchar *)(pin + stride * y0);
470     const uchar *py1 = (const uchar *)(pin + stride * y1);
471     const uchar *py2 = (const uchar *)(pin + stride * y2);
472     const uchar *py3 = (const uchar *)(pin + stride * y3);
473     const uchar *py4 = (const uchar *)(pin + stride * y4);
474 
475     uchar *out = (uchar *)info->outPtr[0];
476     uint32_t x1 = xstart;
477     uint32_t x2 = xend;
478 
479     while((x1 < x2) && (x1 < 2)) {
480         OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
481         out++;
482         x1++;
483     }
484 
485 #if 0//defined(ARCH_ARM_HAVE_NEON)
486     if((x1 + 3) < x2) {
487         uint32_t len = (x2 - x1 - 3) >> 1;
488         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
489         out += len << 1;
490         x1 += len << 1;
491     }
492 #endif
493 
494     while(x1 < x2) {
495         OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
496         out++;
497         x1++;
498     }
499 }
500 
kernelF4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)501 void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelDriverInfo *info,
502                                                 uint32_t xstart, uint32_t xend,
503                                                 uint32_t outstep) {
504     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
505     if (!cp->alloc.get()) {
506         ALOGE("Convolve5x5 executed without input, skipping");
507         return;
508     }
509     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
510     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
511 
512     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
513     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
514     uint32_t y2 = info->current.y;
515     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
516     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
517 
518     const float4 *py0 = (const float4 *)(pin + stride * y0);
519     const float4 *py1 = (const float4 *)(pin + stride * y1);
520     const float4 *py2 = (const float4 *)(pin + stride * y2);
521     const float4 *py3 = (const float4 *)(pin + stride * y3);
522     const float4 *py4 = (const float4 *)(pin + stride * y4);
523 
524     float4 *out = (float4 *)info->outPtr[0];
525     uint32_t x1 = xstart;
526     uint32_t x2 = xend;
527 
528     while((x1 < x2) && (x1 < 2)) {
529         OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
530         out++;
531         x1++;
532     }
533 
534 #if 0//defined(ARCH_ARM_HAVE_NEON)
535     if((x1 + 3) < x2) {
536         uint32_t len = (x2 - x1 - 3) >> 1;
537         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
538         out += len << 1;
539         x1 += len << 1;
540     }
541 #endif
542 
543     while(x1 < x2) {
544         OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
545         out++;
546         x1++;
547     }
548 }
549 
kernelF2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)550 void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelDriverInfo *info,
551                                                 uint32_t xstart, uint32_t xend,
552                                                 uint32_t outstep) {
553     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
554     if (!cp->alloc.get()) {
555         ALOGE("Convolve5x5 executed without input, skipping");
556         return;
557     }
558     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
559     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
560 
561     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
562     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
563     uint32_t y2 = info->current.y;
564     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
565     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
566 
567     const float2 *py0 = (const float2 *)(pin + stride * y0);
568     const float2 *py1 = (const float2 *)(pin + stride * y1);
569     const float2 *py2 = (const float2 *)(pin + stride * y2);
570     const float2 *py3 = (const float2 *)(pin + stride * y3);
571     const float2 *py4 = (const float2 *)(pin + stride * y4);
572 
573     float2 *out = (float2 *)info->outPtr[0];
574     uint32_t x1 = xstart;
575     uint32_t x2 = xend;
576 
577     while((x1 < x2) && (x1 < 2)) {
578         OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
579         out++;
580         x1++;
581     }
582 
583 #if 0//defined(ARCH_ARM_HAVE_NEON)
584     if((x1 + 3) < x2) {
585         uint32_t len = (x2 - x1 - 3) >> 1;
586         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
587         out += len << 1;
588         x1 += len << 1;
589     }
590 #endif
591 
592     while(x1 < x2) {
593         OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
594         out++;
595         x1++;
596     }
597 }
598 
kernelF1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)599 void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelDriverInfo *info,
600                                                 uint32_t xstart, uint32_t xend,
601                                                 uint32_t outstep) {
602     RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
603     if (!cp->alloc.get()) {
604         ALOGE("Convolve5x5 executed without input, skipping");
605         return;
606     }
607     const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
608     const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
609 
610     uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
611     uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
612     uint32_t y2 = info->current.y;
613     uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
614     uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
615 
616     const float *py0 = (const float *)(pin + stride * y0);
617     const float *py1 = (const float *)(pin + stride * y1);
618     const float *py2 = (const float *)(pin + stride * y2);
619     const float *py3 = (const float *)(pin + stride * y3);
620     const float *py4 = (const float *)(pin + stride * y4);
621 
622     float *out = (float *)info->outPtr[0];
623     uint32_t x1 = xstart;
624     uint32_t x2 = xend;
625 
626     while((x1 < x2) && (x1 < 2)) {
627         OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
628         out++;
629         x1++;
630     }
631 
632 #if 0//defined(ARCH_ARM_HAVE_NEON)
633     if((x1 + 3) < x2) {
634         uint32_t len = (x2 - x1 - 3) >> 1;
635         rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
636         out += len << 1;
637         x1 += len << 1;
638     }
639 #endif
640 
641     while(x1 < x2) {
642         OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
643         out++;
644         x1++;
645     }
646 }
647 
RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)648 RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
649             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
650             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
651 
652     if (e->getType() == RS_TYPE_FLOAT_32) {
653         switch(e->getVectorSize()) {
654         case 1:
655             mRootPtr = &kernelF1;
656             break;
657         case 2:
658             mRootPtr = &kernelF2;
659             break;
660         case 3:
661         case 4:
662             mRootPtr = &kernelF4;
663             break;
664         }
665     } else {
666         switch(e->getVectorSize()) {
667         case 1:
668             mRootPtr = &kernelU1;
669             break;
670         case 2:
671             mRootPtr = &kernelU2;
672             break;
673         case 3:
674         case 4:
675             mRootPtr = &kernelU4;
676             break;
677         }
678     }
679     for(int ct=0; ct < 25; ct++) {
680         mFp[ct] = 1.f / 25.f;
681         mIp[ct] = (int16_t)(mFp[ct] * 256.f);
682     }
683 }
684 
~RsdCpuScriptIntrinsicConvolve5x5()685 RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
686 }
687 
populateScript(Script * s)688 void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
689     s->mHal.info.exportedVariableCount = 2;
690 }
691 
invokeFreeChildren()692 void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
693     alloc.clear();
694 }
695 
rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)696 RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
697                                             const Script *s, const Element *e) {
698 
699     return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
700 }
701 
702 } // namespace renderscript
703 } // namespace android
704