1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "rsCpuIntrinsic.h"
18 #include "rsCpuIntrinsicInlines.h"
19 
20 namespace android {
21 namespace renderscript {
22 
// CPU implementation of the blur intrinsic for 8-bit unsigned elements with
// one or four channels (the constructor selects kernelU1 or kernelU4).
// The blur is separable: each kernel invocation first blurs one row
// vertically into a float buffer, then blurs that buffer horizontally into
// the 8-bit output.
class RsdCpuScriptIntrinsicBlur : public RsdCpuScriptIntrinsic {
public:
    // Sets the number of exported variables on the script to 2.
    void populateScript(Script *) override;
    // Releases the reference to the input allocation.
    void invokeFreeChildren() override;

    // Slot 0: reads a float radius from |data| and recomputes the weights.
    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
    // Slot 1: stores |data| (an Allocation) as the blur input.
    void setGlobalObj(uint32_t slot, ObjectBase *data) override;

    ~RsdCpuScriptIntrinsicBlur() override;
    RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);

protected:
    // The size of the kernel radius is limited to 25 in ScriptIntrinsicBlur.java.
    // So, the max kernel size is 51 (= 2 * 25 + 1).
    // Considering SSSE3 case, which requires the size is multiple of 4,
    // at least 52 words are necessary. Values outside of the kernel should be 0.
    // NOTE(review): the arrays below hold 104 entries, twice the 52 the
    // comment above asks for; the reason is not visible in this file.
    // Normalized float weights; entry [r + mIradius] is the weight for offset r.
    float mFp[104];
    // The same weights as 16-bit fixed point (weight * 65536, rounded); handed
    // to the rsdIntrinsicBlurU*_K kernels.
    uint16_t mIp[104];
    // One heap scratch pointer per thread (array length is the context's
    // thread count), indexed by the kernel's info->lid; see kernelU4.
    void **mScratch;
    // Row width (info->dim.x) each mScratch entry was last sized for.
    size_t *mScratchSize;
    // Blur radius as supplied through setGlobalVar; the constructor sets 5.
    float mRadius;
    // Integer radius derived from mRadius by ComputeGaussianWeights();
    // the kernel has 2 * mIradius + 1 taps.
    int mIradius;
    // Input allocation supplied through setGlobalObj.
    ObjectBaseRef<Allocation> mAlloc;

    // Per-row kernel entry points; one of them is stored in mRootPtr by the
    // constructor.
    static void kernelU4(const RsExpandKernelDriverInfo *info,
                         uint32_t xstart, uint32_t xend,
                         uint32_t outstep);
    static void kernelU1(const RsExpandKernelDriverInfo *info,
                         uint32_t xstart, uint32_t xend,
                         uint32_t outstep);
    // Rebuilds mFp, mIp and mIradius from mRadius.
    void ComputeGaussianWeights();
};
55 
56 
ComputeGaussianWeights()57 void RsdCpuScriptIntrinsicBlur::ComputeGaussianWeights() {
58     memset(mFp, 0, sizeof(mFp));
59     memset(mIp, 0, sizeof(mIp));
60 
61     // Compute gaussian weights for the blur
62     // e is the euler's number
63     // TODO Define these constants only once
64     float e = 2.718281828459045f;
65     float pi = 3.1415926535897932f;
66     // g(x) = (1 / (sqrt(2 * pi) * sigma)) * e ^ (-x^2 / (2 * sigma^2))
67     // x is of the form [-radius .. 0 .. radius]
68     // and sigma varies with the radius.
69     // Based on some experimental radius values and sigmas,
70     // we approximately fit sigma = f(radius) as
71     // sigma = radius * 0.4  + 0.6
72     // The larger the radius gets, the more our gaussian blur
73     // will resemble a box blur since with large sigma
74     // the gaussian curve begins to lose its shape
75     float sigma = 0.4f * mRadius + 0.6f;
76 
77     // Now compute the coefficients. We will store some redundant values to save
78     // some math during the blur calculations precompute some values
79     float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
80     float coeff2 = - 1.0f / (2.0f * sigma * sigma);
81 
82     float normalizeFactor = 0.0f;
83     float floatR = 0.0f;
84     int r;
85     mIradius = (float)ceil(mRadius) + 0.5f;
86     for (r = -mIradius; r <= mIradius; r ++) {
87         floatR = (float)r;
88         mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2);
89         normalizeFactor += mFp[r + mIradius];
90     }
91 
92     // Now we need to normalize the weights because all our coefficients need to add up to one
93     normalizeFactor = 1.0f / normalizeFactor;
94     for (r = -mIradius; r <= mIradius; r ++) {
95         mFp[r + mIradius] *= normalizeFactor;
96         mIp[r + mIradius] = (uint16_t)(mFp[r + mIradius] * 65536.0f + 0.5f);
97     }
98 }
99 
setGlobalObj(uint32_t slot,ObjectBase * data)100 void RsdCpuScriptIntrinsicBlur::setGlobalObj(uint32_t slot, ObjectBase *data) {
101     rsAssert(slot == 1);
102     mAlloc.set(static_cast<Allocation *>(data));
103 }
104 
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)105 void RsdCpuScriptIntrinsicBlur::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
106     rsAssert(slot == 0);
107     mRadius = ((const float *)data)[0];
108     ComputeGaussianWeights();
109 }
110 
111 
112 
OneVU4(const RsExpandKernelDriverInfo * info,float4 * out,int32_t x,int32_t y,const uchar * ptrIn,int iStride,const float * gPtr,int iradius)113 static void OneVU4(const RsExpandKernelDriverInfo *info, float4 *out, int32_t x, int32_t y,
114                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
115 
116     const uchar *pi = ptrIn + x*4;
117 
118     float4 blurredPixel = 0;
119     for (int r = -iradius; r <= iradius; r ++) {
120         int validY = rsMax((y + r), 0);
121         validY = rsMin(validY, (int)(info->dim.y- 1));
122         const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride];
123         float4 pf = convert_float4(pvy[0]);
124         blurredPixel += pf * gPtr[0];
125         gPtr++;
126     }
127 
128     out[0] = blurredPixel;
129 }
130 
OneVU1(const RsExpandKernelDriverInfo * info,float * out,int32_t x,int32_t y,const uchar * ptrIn,int iStride,const float * gPtr,int iradius)131 static void OneVU1(const RsExpandKernelDriverInfo *info, float *out, int32_t x, int32_t y,
132                    const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
133 
134     const uchar *pi = ptrIn + x;
135 
136     float blurredPixel = 0;
137     for (int r = -iradius; r <= iradius; r ++) {
138         int validY = rsMax((y + r), 0);
139         validY = rsMin(validY, (int)(info->dim.y - 1));
140         float pf = (float)pi[validY * iStride];
141         blurredPixel += pf * gPtr[0];
142         gPtr++;
143     }
144 
145     out[0] = blurredPixel;
146 }
147 
148 } // namespace renderscript
149 } // namespace android
150 
151 
// Externally defined blur kernels with C linkage; their definitions are not in
// this file. In this file they are called only when ARCH_ARM_USE_INTRINSICS is
// defined (from kernelU1 / kernelU4), with: the output pointer, the input row
// at the current y, image width and height, row stride, start x, current y,
// pixel count, the integer radius, and a pointer to the centre of the 16-bit
// weight table (mIp + mIradius).
extern "C" void rsdIntrinsicBlurU1_K(uchar *out, uchar const *in, size_t w, size_t h,
                 size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
extern "C" void rsdIntrinsicBlurU4_K(uchar4 *out, uchar4 const *in, size_t w, size_t h,
                 size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);

#if defined(ARCH_X86_HAVE_SSSE3)
// Externally defined kernels used on the SSSE3 path. As used in this file:
// rsdIntrinsicBlurVFU4_K performs the vertical pass (called from OneVFU4 and
// OneVFU1), rsdIntrinsicBlurHFU4_K and rsdIntrinsicBlurHFU1_K perform the
// horizontal pass (called from kernelU4 and kernelU1 respectively).
// NOTE(review): alignment and over-read requirements of these kernels are not
// visible here — confirm against their implementation before changing callers.
extern void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int ct);
extern void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
extern void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
#endif
162 
163 using android::renderscript::gArchUseSIMD;
164 
OneVFU4(float4 * out,const uchar * ptrIn,int iStride,const float * gPtr,int ct,int x1,int x2)165 static void OneVFU4(float4 *out,
166                     const uchar *ptrIn, int iStride, const float* gPtr, int ct,
167                     int x1, int x2) {
168     out += x1;
169 #if defined(ARCH_X86_HAVE_SSSE3)
170     if (gArchUseSIMD) {
171         int t = (x2 - x1);
172         t &= ~1;
173         if (t) {
174             rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
175         }
176         x1 += t;
177         out += t;
178         ptrIn += t << 2;
179     }
180 #endif
181     while(x2 > x1) {
182         const uchar *pi = ptrIn;
183         float4 blurredPixel = 0;
184         const float* gp = gPtr;
185 
186         for (int r = 0; r < ct; r++) {
187             float4 pf = convert_float4(((const uchar4 *)pi)[0]);
188             blurredPixel += pf * gp[0];
189             pi += iStride;
190             gp++;
191         }
192         out->xyzw = blurredPixel;
193         x1++;
194         out++;
195         ptrIn+=4;
196     }
197 }
198 
OneVFU1(float * out,const uchar * ptrIn,int iStride,const float * gPtr,int ct,int x1,int x2)199 static void OneVFU1(float *out,
200                     const uchar *ptrIn, int iStride, const float* gPtr, int ct, int x1, int x2) {
201 
202     int len = x2 - x1;
203     out += x1;
204 
205     while((x2 > x1) && (((uintptr_t)ptrIn) & 0x3)) {
206         const uchar *pi = ptrIn;
207         float blurredPixel = 0;
208         const float* gp = gPtr;
209 
210         for (int r = 0; r < ct; r++) {
211             float pf = (float)pi[0];
212             blurredPixel += pf * gp[0];
213             pi += iStride;
214             gp++;
215         }
216         out[0] = blurredPixel;
217         x1++;
218         out++;
219         ptrIn++;
220         len--;
221     }
222 #if defined(ARCH_X86_HAVE_SSSE3)
223     if (gArchUseSIMD && (x2 > x1)) {
224         int t = (x2 - x1) >> 2;
225         t &= ~1;
226         if (t) {
227             rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t );
228             len -= t << 2;
229             ptrIn += t << 2;
230             out += t << 2;
231         }
232     }
233 #endif
234     while(len > 0) {
235         const uchar *pi = ptrIn;
236         float blurredPixel = 0;
237         const float* gp = gPtr;
238 
239         for (int r = 0; r < ct; r++) {
240             float pf = (float)pi[0];
241             blurredPixel += pf * gp[0];
242             pi += iStride;
243             gp++;
244         }
245         out[0] = blurredPixel;
246         len--;
247         out++;
248         ptrIn++;
249     }
250 }
251 
252 using android::renderscript::rsMin;
253 using android::renderscript::rsMax;
254 
OneHU4(const RsExpandKernelDriverInfo * info,uchar4 * out,int32_t x,const float4 * ptrIn,const float * gPtr,int iradius)255 static void OneHU4(const RsExpandKernelDriverInfo *info, uchar4 *out, int32_t x,
256                    const float4 *ptrIn, const float* gPtr, int iradius) {
257 
258     float4 blurredPixel = 0;
259     for (int r = -iradius; r <= iradius; r ++) {
260         int validX = rsMax((x + r), 0);
261         validX = rsMin(validX, (int)(info->dim.x - 1));
262         float4 pf = ptrIn[validX];
263         blurredPixel += pf * gPtr[0];
264         gPtr++;
265     }
266 
267     out->xyzw = convert_uchar4(blurredPixel);
268 }
269 
OneHU1(const RsExpandKernelDriverInfo * info,uchar * out,int32_t x,const float * ptrIn,const float * gPtr,int iradius)270 static void OneHU1(const RsExpandKernelDriverInfo *info, uchar *out, int32_t x,
271                    const float *ptrIn, const float* gPtr, int iradius) {
272 
273     float blurredPixel = 0;
274     for (int r = -iradius; r <= iradius; r ++) {
275         int validX = rsMax((x + r), 0);
276         validX = rsMin(validX, (int)(info->dim.x - 1));
277         float pf = ptrIn[validX];
278         blurredPixel += pf * gPtr[0];
279         gPtr++;
280     }
281 
282     out[0] = (uchar)blurredPixel;
283 }
284 
285 
286 namespace android {
287 namespace renderscript {
288 
kernelU4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)289 void RsdCpuScriptIntrinsicBlur::kernelU4(const RsExpandKernelDriverInfo *info,
290                                          uint32_t xstart, uint32_t xend,
291                                          uint32_t outstep) {
292 
293     float4 stackbuf[2048];
294     float4 *buf = &stackbuf[0];
295     RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)info->usr;
296     if (!cp->mAlloc.get()) {
297         ALOGE("Blur executed without input, skipping");
298         return;
299     }
300     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
301     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
302 
303     uchar4 *out = (uchar4 *)info->outPtr[0];
304     uint32_t x1 = xstart;
305     uint32_t x2 = xend;
306 
307 #if defined(ARCH_ARM_USE_INTRINSICS)
308     if (gArchUseSIMD && info->dim.x >= 4) {
309       rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * info->current.y),
310                  info->dim.x, info->dim.y,
311                  stride, x1, info->current.y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
312         return;
313     }
314 #endif
315 
316     if (info->dim.x > 2048) {
317         if ((info->dim.x > cp->mScratchSize[info->lid]) || !cp->mScratch[info->lid]) {
318             // Pad the side of the allocation by one unit to allow alignment later
319             cp->mScratch[info->lid] = realloc(cp->mScratch[info->lid], (info->dim.x + 1) * 16);
320             cp->mScratchSize[info->lid] = info->dim.x;
321         }
322         // realloc only aligns to 8 bytes so we manually align to 16.
323         buf = (float4 *) ((((intptr_t)cp->mScratch[info->lid]) + 15) & ~0xf);
324     }
325     float4 *fout = (float4 *)buf;
326     int y = info->current.y;
327     if ((y > cp->mIradius) && (y < ((int)info->dim.y - cp->mIradius))) {
328         const uchar *pi = pin + (y - cp->mIradius) * stride;
329         OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, info->dim.x);
330     } else {
331         x1 = 0;
332         while(info->dim.x > x1) {
333             OneVU4(info, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
334             fout++;
335             x1++;
336         }
337     }
338 
339     x1 = xstart;
340     while ((x1 < (uint32_t)cp->mIradius) && (x1 < x2)) {
341         OneHU4(info, out, x1, buf, cp->mFp, cp->mIradius);
342         out++;
343         x1++;
344     }
345 #if defined(ARCH_X86_HAVE_SSSE3)
346     if (gArchUseSIMD) {
347         if ((x1 + cp->mIradius) < x2) {
348             rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp,
349                                    cp->mIradius * 2 + 1, x1, x2 - cp->mIradius);
350             out += (x2 - cp->mIradius) - x1;
351             x1 = x2 - cp->mIradius;
352         }
353     }
354 #endif
355     while(x2 > x1) {
356         OneHU4(info, out, x1, buf, cp->mFp, cp->mIradius);
357         out++;
358         x1++;
359     }
360 }
361 
kernelU1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)362 void RsdCpuScriptIntrinsicBlur::kernelU1(const RsExpandKernelDriverInfo *info,
363                                          uint32_t xstart, uint32_t xend,
364                                          uint32_t outstep) {
365     float buf[4 * 2048];
366     RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)info->usr;
367     if (!cp->mAlloc.get()) {
368         ALOGE("Blur executed without input, skipping");
369         return;
370     }
371     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
372     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
373 
374     uchar *out = (uchar *)info->outPtr[0];
375     uint32_t x1 = xstart;
376     uint32_t x2 = xend;
377 
378 #if defined(ARCH_ARM_USE_INTRINSICS)
379     if (gArchUseSIMD && info->dim.x >= 16) {
380         // The specialisation for r<=8 has an awkward prefill case, which is
381         // fiddly to resolve, where starting close to the right edge can cause
382         // a read beyond the end of input.  So avoid that case here.
383         if (cp->mIradius > 8 || (info->dim.x - rsMax(0, (int32_t)x1 - 8)) >= 16) {
384             rsdIntrinsicBlurU1_K(out, pin + stride * info->current.y, info->dim.x, info->dim.y,
385                      stride, x1, info->current.y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
386             return;
387         }
388     }
389 #endif
390 
391     float *fout = (float *)buf;
392     int y = info->current.y;
393     if ((y > cp->mIradius) && (y < ((int)info->dim.y - cp->mIradius -1))) {
394         const uchar *pi = pin + (y - cp->mIradius) * stride;
395         OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, info->dim.x);
396     } else {
397         x1 = 0;
398         while(info->dim.x > x1) {
399             OneVU1(info, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
400             fout++;
401             x1++;
402         }
403     }
404 
405     x1 = xstart;
406     while ((x1 < x2) &&
407            ((x1 < (uint32_t)cp->mIradius) || (((uintptr_t)out) & 0x3))) {
408         OneHU1(info, out, x1, buf, cp->mFp, cp->mIradius);
409         out++;
410         x1++;
411     }
412 #if defined(ARCH_X86_HAVE_SSSE3)
413     if (gArchUseSIMD) {
414         if ((x1 + cp->mIradius) < x2) {
415             uint32_t len = x2 - (x1 + cp->mIradius);
416             len &= ~3;
417 
418             // rsdIntrinsicBlurHFU1_K() processes each four float values in |buf| at once, so it
419             // nees to ensure four more values can be accessed in order to avoid accessing
420             // uninitialized buffer.
421             if (len > 4) {
422                 len -= 4;
423                 rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - cp->mIradius, cp->mFp,
424                                        cp->mIradius * 2 + 1, x1, x1 + len);
425                 out += len;
426                 x1 += len;
427             }
428         }
429     }
430 #endif
431     while(x2 > x1) {
432         OneHU1(info, out, x1, buf, cp->mFp, cp->mIradius);
433         out++;
434         x1++;
435     }
436 }
437 
RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)438 RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx,
439                                                      const Script *s, const Element *e)
440             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_BLUR) {
441 
442     mRootPtr = nullptr;
443     if (e->getType() == RS_TYPE_UNSIGNED_8) {
444         switch (e->getVectorSize()) {
445         case 1:
446             mRootPtr = &kernelU1;
447             break;
448         case 4:
449             mRootPtr = &kernelU4;
450             break;
451         }
452     }
453     rsAssert(mRootPtr);
454     mRadius = 5;
455 
456     mScratch = new void *[mCtx->getThreadCount()];
457     mScratchSize = new size_t[mCtx->getThreadCount()];
458     memset(mScratch, 0, sizeof(void *) * mCtx->getThreadCount());
459     memset(mScratchSize, 0, sizeof(size_t) * mCtx->getThreadCount());
460 
461     ComputeGaussianWeights();
462 }
463 
~RsdCpuScriptIntrinsicBlur()464 RsdCpuScriptIntrinsicBlur::~RsdCpuScriptIntrinsicBlur() {
465     uint32_t threads = mCtx->getThreadCount();
466     if (mScratch) {
467         for (size_t i = 0; i < threads; i++) {
468             if (mScratch[i]) {
469                 free(mScratch[i]);
470             }
471         }
472         delete []mScratch;
473     }
474     if (mScratchSize) {
475         delete []mScratchSize;
476     }
477 }
478 
populateScript(Script * s)479 void RsdCpuScriptIntrinsicBlur::populateScript(Script *s) {
480     s->mHal.info.exportedVariableCount = 2;
481 }
482 
invokeFreeChildren()483 void RsdCpuScriptIntrinsicBlur::invokeFreeChildren() {
484     mAlloc.clear();
485 }
486 
rsdIntrinsic_Blur(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)487 RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
488 
489     return new RsdCpuScriptIntrinsicBlur(ctx, s, e);
490 }
491 
492 } // namespace renderscript
493 } // namespace android
494