1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "rsCpuIntrinsic.h"
18 #include "rsCpuIntrinsicInlines.h"
19 
20 namespace android {
21 namespace renderscript {
22 
23 
24 class RsdCpuScriptIntrinsicHistogram : public RsdCpuScriptIntrinsic {
25 public:
26     void populateScript(Script *) override;
27     void invokeFreeChildren() override;
28 
29     void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
30     void setGlobalObj(uint32_t slot, ObjectBase *data) override;
31 
32     ~RsdCpuScriptIntrinsicHistogram() override;
33     RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
34 
35 protected:
36     void preLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
37                    Allocation * aout, const void * usr,
38                    uint32_t usrLen, const RsScriptCall *sc);
39     void postLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
40                     Allocation * aout, const void * usr,
41                     uint32_t usrLen, const RsScriptCall *sc);
42 
43 
44     float mDot[4];
45     int mDotI[4];
46     int *mSums;
47     ObjectBaseRef<Allocation> mAllocOut;
48 
49     static void kernelP1U4(const RsExpandKernelDriverInfo *info,
50                            uint32_t xstart, uint32_t xend,
51                            uint32_t outstep);
52     static void kernelP1U3(const RsExpandKernelDriverInfo *info,
53                            uint32_t xstart, uint32_t xend,
54                            uint32_t outstep);
55     static void kernelP1U2(const RsExpandKernelDriverInfo *info,
56                            uint32_t xstart, uint32_t xend,
57                            uint32_t outstep);
58     static void kernelP1U1(const RsExpandKernelDriverInfo *info,
59                            uint32_t xstart, uint32_t xend,
60                            uint32_t outstep);
61 
62     static void kernelP1L4(const RsExpandKernelDriverInfo *info,
63                            uint32_t xstart, uint32_t xend,
64                            uint32_t outstep);
65     static void kernelP1L3(const RsExpandKernelDriverInfo *info,
66                            uint32_t xstart, uint32_t xend,
67                            uint32_t outstep);
68     static void kernelP1L2(const RsExpandKernelDriverInfo *info,
69                            uint32_t xstart, uint32_t xend,
70                            uint32_t outstep);
71     static void kernelP1L1(const RsExpandKernelDriverInfo *info,
72                            uint32_t xstart, uint32_t xend,
73                            uint32_t outstep);
74 
75 };
76 
setGlobalObj(uint32_t slot,ObjectBase * data)77 void RsdCpuScriptIntrinsicHistogram::setGlobalObj(uint32_t slot, ObjectBase *data) {
78     rsAssert(slot == 1);
79     mAllocOut.set(static_cast<Allocation *>(data));
80 }
81 
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)82 void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
83     rsAssert(slot == 0);
84     rsAssert(dataLength == 16);
85     memcpy(mDot, data, 16);
86     mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
87     mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
88     mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
89     mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
90 }
91 
92 
93 
94 void
preLaunch(uint32_t slot,const Allocation ** ains,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)95 RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
96                                           const Allocation ** ains,
97                                           uint32_t inLen, Allocation * aout,
98                                           const void * usr, uint32_t usrLen,
99                                           const RsScriptCall *sc) {
100 
101     const uint32_t threads = mCtx->getThreadCount();
102     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
103 
104     switch (slot) {
105     case 0:
106         switch(vSize) {
107         case 1:
108             mRootPtr = &kernelP1U1;
109             break;
110         case 2:
111             mRootPtr = &kernelP1U2;
112             break;
113         case 3:
114             mRootPtr = &kernelP1U3;
115             vSize = 4;
116             break;
117         case 4:
118             mRootPtr = &kernelP1U4;
119             break;
120         }
121         break;
122     case 1:
123         switch(ains[0]->getType()->getElement()->getVectorSize()) {
124         case 1:
125             mRootPtr = &kernelP1L1;
126             break;
127         case 2:
128             mRootPtr = &kernelP1L2;
129             break;
130         case 3:
131             mRootPtr = &kernelP1L3;
132             break;
133         case 4:
134             mRootPtr = &kernelP1L4;
135             break;
136         }
137         break;
138     }
139     memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
140 }
141 
142 void
postLaunch(uint32_t slot,const Allocation ** ains,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)143 RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot,
144                                            const Allocation ** ains,
145                                            uint32_t inLen,  Allocation * aout,
146                                            const void * usr, uint32_t usrLen,
147                                            const RsScriptCall *sc) {
148 
149     unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
150     uint32_t threads = mCtx->getThreadCount();
151     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
152 
153     if (vSize == 3) vSize = 4;
154 
155     for (uint32_t ct=0; ct < (256 * vSize); ct++) {
156         o[ct] = mSums[ct];
157         for (uint32_t t=1; t < threads; t++) {
158             o[ct] += mSums[ct + (256 * vSize * t)];
159         }
160     }
161 }
162 
kernelP1U4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)163 void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelDriverInfo *info,
164                                                 uint32_t xstart, uint32_t xend,
165                                                 uint32_t outstep) {
166 
167     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
168     uchar *in = (uchar *)info->inPtr[0];
169     int * sums = &cp->mSums[256 * 4 * info->lid];
170 
171     for (uint32_t x = xstart; x < xend; x++) {
172         sums[(in[0] << 2)    ] ++;
173         sums[(in[1] << 2) + 1] ++;
174         sums[(in[2] << 2) + 2] ++;
175         sums[(in[3] << 2) + 3] ++;
176         in += info->inStride[0];
177     }
178 }
179 
kernelP1U3(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)180 void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelDriverInfo *info,
181                                                 uint32_t xstart, uint32_t xend,
182                                                 uint32_t outstep) {
183 
184     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
185     uchar *in = (uchar *)info->inPtr[0];
186     int * sums = &cp->mSums[256 * 4 * info->lid];
187 
188     for (uint32_t x = xstart; x < xend; x++) {
189         sums[(in[0] << 2)    ] ++;
190         sums[(in[1] << 2) + 1] ++;
191         sums[(in[2] << 2) + 2] ++;
192         in += info->inStride[0];
193     }
194 }
195 
kernelP1U2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)196 void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelDriverInfo *info,
197                                                 uint32_t xstart, uint32_t xend,
198                                                 uint32_t outstep) {
199 
200     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
201     uchar *in = (uchar *)info->inPtr[0];
202     int * sums = &cp->mSums[256 * 2 * info->lid];
203 
204     for (uint32_t x = xstart; x < xend; x++) {
205         sums[(in[0] << 1)    ] ++;
206         sums[(in[1] << 1) + 1] ++;
207         in += info->inStride[0];
208     }
209 }
210 
kernelP1L4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)211 void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelDriverInfo *info,
212                                                 uint32_t xstart, uint32_t xend,
213                                                 uint32_t outstep) {
214 
215     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
216     uchar *in = (uchar *)info->inPtr[0];
217     int * sums = &cp->mSums[256 * info->lid];
218 
219     for (uint32_t x = xstart; x < xend; x++) {
220         int t = (cp->mDotI[0] * in[0]) +
221                 (cp->mDotI[1] * in[1]) +
222                 (cp->mDotI[2] * in[2]) +
223                 (cp->mDotI[3] * in[3]);
224         sums[(t + 0x7f) >> 8] ++;
225         in += info->inStride[0];
226     }
227 }
228 
kernelP1L3(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)229 void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelDriverInfo *info,
230                                                 uint32_t xstart, uint32_t xend,
231                                                 uint32_t outstep) {
232 
233     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
234     uchar *in = (uchar *)info->inPtr[0];
235     int * sums = &cp->mSums[256 * info->lid];
236 
237     for (uint32_t x = xstart; x < xend; x++) {
238         int t = (cp->mDotI[0] * in[0]) +
239                 (cp->mDotI[1] * in[1]) +
240                 (cp->mDotI[2] * in[2]);
241         sums[(t + 0x7f) >> 8] ++;
242         in += info->inStride[0];
243     }
244 }
245 
kernelP1L2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)246 void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelDriverInfo *info,
247                                                 uint32_t xstart, uint32_t xend,
248                                                 uint32_t outstep) {
249 
250     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
251     uchar *in = (uchar *)info->inPtr[0];
252     int * sums = &cp->mSums[256 * info->lid];
253 
254     for (uint32_t x = xstart; x < xend; x++) {
255         int t = (cp->mDotI[0] * in[0]) +
256                 (cp->mDotI[1] * in[1]);
257         sums[(t + 0x7f) >> 8] ++;
258         in += info->inStride[0];
259     }
260 }
261 
kernelP1L1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)262 void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelDriverInfo *info,
263                                                 uint32_t xstart, uint32_t xend,
264                                                 uint32_t outstep) {
265 
266     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
267     uchar *in = (uchar *)info->inPtr[0];
268     int * sums = &cp->mSums[256 * info->lid];
269 
270     for (uint32_t x = xstart; x < xend; x++) {
271         int t = (cp->mDotI[0] * in[0]);
272         sums[(t + 0x7f) >> 8] ++;
273         in += info->inStride[0];
274     }
275 }
276 
kernelP1U1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)277 void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelDriverInfo *info,
278                                                 uint32_t xstart, uint32_t xend,
279                                                 uint32_t outstep) {
280 
281     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
282     uchar *in = (uchar *)info->inPtr[0];
283     int * sums = &cp->mSums[256 * info->lid];
284 
285     for (uint32_t x = xstart; x < xend; x++) {
286         sums[in[0]] ++;
287         in += info->inStride[0];
288     }
289 }
290 
291 
RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)292 RsdCpuScriptIntrinsicHistogram::RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx,
293                                                      const Script *s, const Element *e)
294             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_HISTOGRAM) {
295 
296     mRootPtr = nullptr;
297     mSums = new int[256 * 4 * mCtx->getThreadCount()];
298     mDot[0] = 0.299f;
299     mDot[1] = 0.587f;
300     mDot[2] = 0.114f;
301     mDot[3] = 0;
302     mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
303     mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
304     mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
305     mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
306 }
307 
~RsdCpuScriptIntrinsicHistogram()308 RsdCpuScriptIntrinsicHistogram::~RsdCpuScriptIntrinsicHistogram() {
309     if (mSums) {
310         delete []mSums;
311     }
312 }
313 
populateScript(Script * s)314 void RsdCpuScriptIntrinsicHistogram::populateScript(Script *s) {
315     s->mHal.info.exportedVariableCount = 2;
316 }
317 
invokeFreeChildren()318 void RsdCpuScriptIntrinsicHistogram::invokeFreeChildren() {
319 }
320 
rsdIntrinsic_Histogram(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)321 RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
322 
323     return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
324 }
325 
326 } // namespace renderscript
327 } // namespace android
328