1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #if defined(ARCH_X86_HAVE_AVX2)
18 #include <stdint.h>
19 #include <x86intrin.h>
20 #include <xmmintrin.h>
21 #endif
22 
23 #include "rsCpuIntrinsic.h"
24 #include "rsCpuIntrinsicInlines.h"
25 
26 namespace android {
27 namespace renderscript {
28 
29 
// CPU implementation of the resize intrinsic. The source image is bound via
// setGlobalObj(); preLaunch() selects one of the static kernels below based
// on the source element and computes the scale factors the kernels read.
class RsdCpuScriptIntrinsicResize : public RsdCpuScriptIntrinsic {
public:
    void populateScript(Script *) override;
    void invokeFreeChildren() override;

    // Stores `data` as the source allocation (slot 0 is asserted).
    void setGlobalObj(uint32_t slot, ObjectBase *data) override;

    ~RsdCpuScriptIntrinsicResize() override;
    RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);

    // Chooses the kernel and fills in scaleX / scaleY before a launch.
    void preLaunch(uint32_t slot, const Allocation ** ains,
                   uint32_t inLen, Allocation * aout, const void * usr,
                   uint32_t usrLen, const RsScriptCall *sc) override;

    // Source size divided by output size along each axis; written by
    // preLaunch() and read by the kernels.
    float scaleX;
    float scaleY;

protected:
    // Source allocation sampled by the kernels; null until setGlobalObj().
    ObjectBaseRef<const Allocation> mAlloc;
    // NOTE(review): not referenced by any code visible in this file.
    ObjectBaseRef<const Element> mElement;

    // Per-row kernels. "U" variants work on uchar / uchar2 / uchar4 pixels,
    // "F" variants on float / float2 / float4 pixels. preLaunch() maps
    // vector sizes 3 and 4 to the *4 kernels.
    static void kernelU1(const RsExpandKernelDriverInfo *info,
                         uint32_t xstart, uint32_t xend,
                         uint32_t outstep);
    static void kernelU2(const RsExpandKernelDriverInfo *info,
                         uint32_t xstart, uint32_t xend,
                         uint32_t outstep);
    static void kernelU4(const RsExpandKernelDriverInfo *info,
                         uint32_t xstart, uint32_t xend,
                         uint32_t outstep);
    static void kernelF1(const RsExpandKernelDriverInfo *info,
                         uint32_t xstart, uint32_t xend,
                         uint32_t outstep);
    static void kernelF2(const RsExpandKernelDriverInfo *info,
                         uint32_t xstart, uint32_t xend,
                         uint32_t outstep);
    static void kernelF4(const RsExpandKernelDriverInfo *info,
                         uint32_t xstart, uint32_t xend,
                         uint32_t outstep);
};
70 
setGlobalObj(uint32_t slot,ObjectBase * data)71 void RsdCpuScriptIntrinsicResize::setGlobalObj(uint32_t slot, ObjectBase *data) {
72     rsAssert(slot == 0);
73     mAlloc.set(static_cast<Allocation *>(data));
74 }
75 
// Cubic interpolation between p1 and p2 using neighbours p0 and p3, evaluated
// per component at fractional position x. The polynomial matches the
// Catmull-Rom form: it yields p1 at x == 0 and p2 at x == 1.
// Kept as one expression so the floating-point evaluation order stays fixed.
static float4 cubicInterpolate(float4 p0,float4 p1,float4 p2,float4 p3, float x) {
    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
            + x * (3.f * (p1 - p2) + p3 - p0)));
}
80 
// Two-component overload of the cubic interpolator: blends between p1 and p2
// using neighbours p0 and p3 at fractional position x. The polynomial matches
// the Catmull-Rom form (p1 at x == 0, p2 at x == 1).
// Kept as one expression so the floating-point evaluation order stays fixed.
static float2 cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3, float x) {
    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
            + x * (3.f * (p1 - p2) + p3 - p0)));
}
85 
86 
// Scalar overload of the cubic interpolator (p1 at x == 0, p2 at x == 1).
// Two builds of the same polynomial are provided.
#if defined(ARCH_X86_HAVE_AVX2)
// Variant using fused multiply intrinsics for two sub-terms:
//   _mm_fmsub_ss(4, p2, p3)          -> 4 * p2 - p3
//   _mm_fmadd_ss(3, p1 - p2, p3 - p0) -> 3 * (p1 - p2) + (p3 - p0)
// NOTE(review): the fused operations and the (p3 - p0) grouping may round
// differently from the generic branch below — confirm if bit-exactness matters.
static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) {
   return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 +
           _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2),_mm_set1_ps(p3)))
           + x * (_mm_cvtss_f32(_mm_fmadd_ss (_mm_set1_ps(3.f),_mm_set1_ps(p1 - p2),_mm_set1_ps(p3 - p0))))));

}
#else
// Generic variant: same expression as the float2 / float4 overloads.
static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) {
    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
            + x * (3.f * (p1 - p2) + p3 - p0)));
}
#endif
100 
// Computes one bicubic-filtered uchar4 pixel.
//   yp0..yp3  the four source rows taking part in the filter
//   xf        horizontal sample position in source pixels
//   yf        vertical fraction used to blend the four rows
//   width     number of pixels per source row (used to clamp column taps)
// The result is rounded (+0.5) and clamped to [0, 255] before conversion.
static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
                         float xf, float yf, int width) {
    // Leftmost of the four column taps, and the fraction past the base column.
    const int firstTap = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);
    const int lastCol = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the
    // last column.
    const int c0 = rsMax(0, firstTap + 0);
    const int c1 = rsMax(0, firstTap + 1);
    const int c2 = rsMin(lastCol, firstTap + 2);
    const int c3 = rsMin(lastCol, firstTap + 3);

    // Horizontal pass over a single row.
    auto filterRow = [&](const uchar4 *row) -> float4 {
        return cubicInterpolate(convert_float4(row[c0]),
                                convert_float4(row[c1]),
                                convert_float4(row[c2]),
                                convert_float4(row[c3]), fracX);
    };

    const float4 h0 = filterRow(yp0);
    const float4 h1 = filterRow(yp1);
    const float4 h2 = filterRow(yp2);
    const float4 h3 = filterRow(yp3);

    // Vertical pass, then round and saturate to the 8-bit range.
    const float4 blended = cubicInterpolate(h0, h1, h2, h3, yf);
    const float4 rounded = clamp(blended + 0.5f, 0.f, 255.f);
    return convert_uchar4(rounded);
}
135 
// Computes one bicubic-filtered uchar2 pixel.
//   yp0..yp3  the four source rows taking part in the filter
//   xf        horizontal sample position in source pixels
//   yf        vertical fraction used to blend the four rows
//   width     number of pixels per source row (used to clamp column taps)
// The result is rounded (+0.5) and clamped to [0, 255] before conversion.
static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
                         float xf, float yf, int width) {
    // Leftmost of the four column taps, and the fraction past the base column.
    const int firstTap = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);
    const int lastCol = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the
    // last column.
    const int c0 = rsMax(0, firstTap + 0);
    const int c1 = rsMax(0, firstTap + 1);
    const int c2 = rsMin(lastCol, firstTap + 2);
    const int c3 = rsMin(lastCol, firstTap + 3);

    // Horizontal pass over a single row.
    auto filterRow = [&](const uchar2 *row) -> float2 {
        return cubicInterpolate(convert_float2(row[c0]),
                                convert_float2(row[c1]),
                                convert_float2(row[c2]),
                                convert_float2(row[c3]), fracX);
    };

    const float2 h0 = filterRow(yp0);
    const float2 h1 = filterRow(yp1);
    const float2 h2 = filterRow(yp2);
    const float2 h3 = filterRow(yp3);

    // Vertical pass, then round and saturate to the 8-bit range.
    const float2 blended = cubicInterpolate(h0, h1, h2, h3, yf);
    const float2 rounded = clamp(blended + 0.5f, 0.f, 255.f);
    return convert_uchar2(rounded);
}
170 
// Computes one bicubic-filtered single-channel 8-bit pixel.
//   yp0..yp3  the four source rows taking part in the filter
//   xf        horizontal sample position in source pixels
//   yf        vertical fraction used to blend the four rows
//   width     number of pixels per source row (used to clamp column taps)
// The result is rounded (+0.5) and clamped to [0, 255] before the cast.
static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
                        float xf, float yf, int width) {
    // Leftmost of the four column taps, and the fraction past the base column.
    const int firstTap = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);
    const int lastCol = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the
    // last column.
    const int c0 = rsMax(0, firstTap + 0);
    const int c1 = rsMax(0, firstTap + 1);
    const int c2 = rsMin(lastCol, firstTap + 2);
    const int c3 = rsMin(lastCol, firstTap + 3);

    // Horizontal pass over a single row.
    auto filterRow = [&](const uchar *row) -> float {
        return cubicInterpolate((float)row[c0], (float)row[c1],
                                (float)row[c2], (float)row[c3], fracX);
    };

    const float h0 = filterRow(yp0);
    const float h1 = filterRow(yp1);
    const float h2 = filterRow(yp2);
    const float h3 = filterRow(yp3);

    // Vertical pass, then round and saturate to the 8-bit range.
    const float blended = cubicInterpolate(h0, h1, h2, h3, yf);
    const float rounded = clamp(blended + 0.5f, 0.f, 255.f);
    return (uchar)rounded;
}
194 
// Externally implemented routines (C linkage); their definitions are not in
// this file. They are only called from the ARCH_ARM_USE_INTRINSICS paths of
// the U8 kernels in this file. Argument meanings below describe what those
// callers pass; the routines' internal contracts are not visible here.

// Called with the 16.16 fixed-point horizontal step; its result is forwarded
// unchanged as `osc_ctl` to the resize routines.
extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);

// Row resize for uchar4 pixels. Callers pass:
//   dst / count   output pointer and number of output pixels
//   xf            low 16 bits of the 16.16 fixed-point start position
//   xinc          16.16 fixed-point horizontal step
//   srcn..src2    the four source row pointers, offset to the first column used
//   xclip         number of columns clipped off the left edge
//   avail         srcWidth - xoff + xclip as computed by the caller
//   osc_ctl       value from rsdIntrinsicResize_oscctl_K(xinc)
//   yr            four vertical coefficients from mkYCoeff()
extern "C" void rsdIntrinsicResizeB4_K(
            uchar4 *dst,
            size_t count,
            uint32_t xf,
            uint32_t xinc,
            uchar4 const *srcn,
            uchar4 const *src0,
            uchar4 const *src1,
            uchar4 const *src2,
            size_t xclip,
            size_t avail,
            uint64_t osc_ctl,
            int32_t const *yr);

// Same as rsdIntrinsicResizeB4_K, for uchar2 pixels.
extern "C" void rsdIntrinsicResizeB2_K(
            uchar2 *dst,
            size_t count,
            uint32_t xf,
            uint32_t xinc,
            uchar2 const *srcn,
            uchar2 const *src0,
            uchar2 const *src1,
            uchar2 const *src2,
            size_t xclip,
            size_t avail,
            uint64_t osc_ctl,
            int32_t const *yr);

// Same as rsdIntrinsicResizeB4_K, for single-channel uchar pixels.
extern "C" void rsdIntrinsicResizeB1_K(
            uchar *dst,
            size_t count,
            uint32_t xf,
            uint32_t xinc,
            uchar const *srcn,
            uchar const *src0,
            uchar const *src1,
            uchar const *src2,
            size_t xclip,
            size_t avail,
            uint64_t osc_ctl,
            int32_t const *yr);
238 
239 #if defined(ARCH_ARM_USE_INTRINSICS)
// Fills yr[0..3] with the four vertical cubic coefficients for the fractional
// row offset yf, expressed in 16.16 fixed point (1.0 == 0x10000).
// NOTE(review): yr[0] and yr[3] come out with the opposite sign to the outer
// tap weights of the float cubicInterpolate(); presumably the consumer
// subtracts those two taps — confirm against the external kernel.
static void mkYCoeff(int32_t *yr, float yf) {
    // yf, yf^2 and yf^3 rounded to fixed point.
    const int32_t lin = rint(yf * 0x10000);
    const int32_t sqr = rint(yf * yf * 0x10000);
    const int32_t cub = rint(yf * yf * yf * 0x10000);

    // Doubled coefficients; the final halving is done with a shift below.
    const int32_t outerFirst = -(2 * sqr - cub - lin);
    const int32_t innerFirst = 3 * cub - 5 * sqr + 0x20000;
    const int32_t innerSecond = -3 * cub + 4 * sqr + lin;
    const int32_t outerSecond = -(cub - sqr);

    yr[0] = outerFirst >> 1;
    yr[1] = innerFirst >> 1;
    yr[2] = innerSecond >> 1;
    yr[3] = outerSecond >> 1;
}
250 #endif
251 
// Computes one bicubic-filtered float4 pixel.
//   yp0..yp3  the four source rows taking part in the filter
//   xf        horizontal sample position in source pixels
//   yf        vertical fraction used to blend the four rows
//   width     number of pixels per source row (used to clamp column taps)
// Unlike the 8-bit overloads, the result is returned without rounding or clamping.
static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3,
                         float xf, float yf, int width) {
    // Leftmost of the four column taps, and the fraction past the base column.
    const int firstTap = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);
    const int lastCol = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the
    // last column.
    const int c0 = rsMax(0, firstTap + 0);
    const int c1 = rsMax(0, firstTap + 1);
    const int c2 = rsMin(lastCol, firstTap + 2);
    const int c3 = rsMin(lastCol, firstTap + 3);

    // Horizontal pass over a single row.
    auto filterRow = [&](const float4 *row) -> float4 {
        return cubicInterpolate(row[c0], row[c1],
                                row[c2], row[c3], fracX);
    };

    const float4 h0 = filterRow(yp0);
    const float4 h1 = filterRow(yp1);
    const float4 h2 = filterRow(yp2);
    const float4 h3 = filterRow(yp3);

    // Vertical pass across the four filtered rows.
    return cubicInterpolate(h0, h1, h2, h3, yf);
}
274 
// Computes one bicubic-filtered float2 pixel.
//   yp0..yp3  the four source rows taking part in the filter
//   xf        horizontal sample position in source pixels
//   yf        vertical fraction used to blend the four rows
//   width     number of pixels per source row (used to clamp column taps)
// Unlike the 8-bit overloads, the result is returned without rounding or clamping.
static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3,
                         float xf, float yf, int width) {
    // Leftmost of the four column taps, and the fraction past the base column.
    const int firstTap = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);
    const int lastCol = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the
    // last column.
    const int c0 = rsMax(0, firstTap + 0);
    const int c1 = rsMax(0, firstTap + 1);
    const int c2 = rsMin(lastCol, firstTap + 2);
    const int c3 = rsMin(lastCol, firstTap + 3);

    // Horizontal pass over a single row.
    auto filterRow = [&](const float2 *row) -> float2 {
        return cubicInterpolate(row[c0], row[c1],
                                row[c2], row[c3], fracX);
    };

    const float2 h0 = filterRow(yp0);
    const float2 h1 = filterRow(yp1);
    const float2 h2 = filterRow(yp2);
    const float2 h3 = filterRow(yp3);

    // Vertical pass across the four filtered rows.
    return cubicInterpolate(h0, h1, h2, h3, yf);
}
297 
OneBiCubic(const float * yp0,const float * yp1,const float * yp2,const float * yp3,float xf,float yf,int width)298 static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
299                         float xf, float yf, int width) {
300     int startx = (int) floor(xf - 1);
301     xf = xf - floor(xf);
302     int maxx = width - 1;
303     int xs0 = rsMax(0, startx + 0);
304     int xs1 = rsMax(0, startx + 1);
305     int xs2 = rsMin(maxx, startx + 2);
306     int xs3 = rsMin(maxx, startx + 3);
307 
308     float p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
309                                  yp0[xs2], yp0[xs3], xf);
310     float p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
311                                  yp1[xs2], yp1[xs3], xf);
312     float p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
313                                  yp2[xs2], yp2[xs3], xf);
314     float p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
315                                  yp3[xs2], yp3[xs3], xf);
316 
317     float p  = cubicInterpolate(p0, p1, p2, p3, yf);
318     return p;
319 }
320 
kernelU4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)321 void RsdCpuScriptIntrinsicResize::kernelU4(const RsExpandKernelDriverInfo *info,
322                                                 uint32_t xstart, uint32_t xend,
323                                                 uint32_t outstep) {
324     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
325 
326     if (!cp->mAlloc.get()) {
327         ALOGE("Resize executed without input, skipping");
328         return;
329     }
330     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
331     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
332     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
333     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
334 
335 
336 #if defined(ARCH_X86_HAVE_AVX2)
337     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
338 #else
339     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
340 #endif
341 
342 
343     int starty = (int) floor(yf - 1);
344     yf = yf - floor(yf);
345     int maxy = srcHeight - 1;
346     int ys0 = rsMax(0, starty + 0);
347     int ys1 = rsMax(0, starty + 1);
348     int ys2 = rsMin(maxy, starty + 2);
349     int ys3 = rsMin(maxy, starty + 3);
350 
351     const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
352     const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
353     const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
354     const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
355 
356     uchar4 *out = ((uchar4 *)info->outPtr[0]) + xstart;
357     uint32_t x1 = xstart;
358     uint32_t x2 = xend;
359 
360 #if defined(ARCH_ARM_USE_INTRINSICS)
361     if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
362         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
363         long xf16 = rint(xf * 0x10000);
364         uint32_t xinc16 = rint(cp->scaleX * 0x10000);
365 
366         int xoff = (xf16 >> 16) - 1;
367         int xclip = rsMax(0, xoff) - xoff;
368         int len = x2 - x1;
369 
370         int32_t yr[4];
371         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
372         mkYCoeff(yr, yf);
373 
374         xoff += xclip;
375 
376         rsdIntrinsicResizeB4_K(
377                 out, len,
378                 xf16 & 0xffff, xinc16,
379                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
380                 xclip, srcWidth - xoff + xclip,
381                 osc_ctl, yr);
382         out += len;
383         x1 += len;
384     }
385 #endif
386 
387     while(x1 < x2) {
388 #if defined(ARCH_X86_HAVE_AVX2)
389         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
390 #else
391         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
392 #endif
393         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
394         out++;
395         x1++;
396     }
397 }
398 
kernelU2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)399 void RsdCpuScriptIntrinsicResize::kernelU2(const RsExpandKernelDriverInfo *info,
400                                                 uint32_t xstart, uint32_t xend,
401                                                 uint32_t outstep) {
402     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
403 
404     if (!cp->mAlloc.get()) {
405         ALOGE("Resize executed without input, skipping");
406         return;
407     }
408     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
409     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
410     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
411     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
412 
413 
414 #if defined(ARCH_X86_HAVE_AVX2)
415     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
416 #else
417     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
418 #endif
419 
420     int starty = (int) floor(yf - 1);
421     yf = yf - floor(yf);
422     int maxy = srcHeight - 1;
423     int ys0 = rsMax(0, starty + 0);
424     int ys1 = rsMax(0, starty + 1);
425     int ys2 = rsMin(maxy, starty + 2);
426     int ys3 = rsMin(maxy, starty + 3);
427 
428     const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
429     const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
430     const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
431     const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
432 
433     uchar2 *out = ((uchar2 *)info->outPtr[0]) + xstart;
434     uint32_t x1 = xstart;
435     uint32_t x2 = xend;
436 
437 #if defined(ARCH_ARM_USE_INTRINSICS)
438     if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
439         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
440         long xf16 = rint(xf * 0x10000);
441         uint32_t xinc16 = rint(cp->scaleX * 0x10000);
442 
443         int xoff = (xf16 >> 16) - 1;
444         int xclip = rsMax(0, xoff) - xoff;
445         int len = x2 - x1;
446 
447         int32_t yr[4];
448         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
449         mkYCoeff(yr, yf);
450 
451         xoff += xclip;
452 
453         rsdIntrinsicResizeB2_K(
454                 out, len,
455                 xf16 & 0xffff, xinc16,
456                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
457                 xclip, srcWidth - xoff + xclip,
458                 osc_ctl, yr);
459         out += len;
460         x1 += len;
461     }
462 #endif
463 
464     while(x1 < x2) {
465 
466 #if defined(ARCH_X86_HAVE_AVX2)
467         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
468 #else
469         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
470 #endif
471         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
472         out++;
473         x1++;
474     }
475 }
476 
kernelU1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)477 void RsdCpuScriptIntrinsicResize::kernelU1(const RsExpandKernelDriverInfo *info,
478                                                 uint32_t xstart, uint32_t xend,
479                                                 uint32_t outstep) {
480     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
481 
482     if (!cp->mAlloc.get()) {
483         ALOGE("Resize executed without input, skipping");
484         return;
485     }
486     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
487     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
488     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
489     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
490 
491 
492 #if defined(ARCH_X86_HAVE_AVX2)
493     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
494 #else
495     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
496 #endif
497 
498     int starty = (int) floor(yf - 1);
499     yf = yf - floor(yf);
500     int maxy = srcHeight - 1;
501     int ys0 = rsMax(0, starty + 0);
502     int ys1 = rsMax(0, starty + 1);
503     int ys2 = rsMin(maxy, starty + 2);
504     int ys3 = rsMin(maxy, starty + 3);
505 
506     const uchar *yp0 = pin + stride * ys0;
507     const uchar *yp1 = pin + stride * ys1;
508     const uchar *yp2 = pin + stride * ys2;
509     const uchar *yp3 = pin + stride * ys3;
510 
511     uchar *out = ((uchar *)info->outPtr[0]) + xstart;
512     uint32_t x1 = xstart;
513     uint32_t x2 = xend;
514 
515 #if defined(ARCH_ARM_USE_INTRINSICS)
516     if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
517         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
518         long xf16 = rint(xf * 0x10000);
519         uint32_t xinc16 = rint(cp->scaleX * 0x10000);
520 
521         int xoff = (xf16 >> 16) - 1;
522         int xclip = rsMax(0, xoff) - xoff;
523         int len = x2 - x1;
524 
525         int32_t yr[4];
526         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
527         mkYCoeff(yr, yf);
528 
529         xoff += xclip;
530 
531         rsdIntrinsicResizeB1_K(
532                 out, len,
533                 xf16 & 0xffff, xinc16,
534                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
535                 xclip, srcWidth - xoff + xclip,
536                 osc_ctl, yr);
537         out += len;
538         x1 += len;
539     }
540 #endif
541 
542     while(x1 < x2) {
543 
544 #if defined(ARCH_X86_HAVE_AVX2)
545         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
546 #else
547         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
548 #endif
549 
550         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
551         out++;
552         x1++;
553     }
554 }
555 
kernelF4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)556 void RsdCpuScriptIntrinsicResize::kernelF4(const RsExpandKernelDriverInfo *info,
557                                                 uint32_t xstart, uint32_t xend,
558                                                 uint32_t outstep) {
559     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
560 
561     if (!cp->mAlloc.get()) {
562         ALOGE("Resize executed without input, skipping");
563         return;
564     }
565     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
566     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
567     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
568     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
569 
570 #if defined(ARCH_X86_HAVE_AVX2)
571     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
572 #else
573     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
574 #endif
575 
576     int starty = (int) floor(yf - 1);
577     yf = yf - floor(yf);
578     int maxy = srcHeight - 1;
579     int ys0 = rsMax(0, starty + 0);
580     int ys1 = rsMax(0, starty + 1);
581     int ys2 = rsMin(maxy, starty + 2);
582     int ys3 = rsMin(maxy, starty + 3);
583 
584     const float4 *yp0 = (const float4 *)(pin + stride * ys0);
585     const float4 *yp1 = (const float4 *)(pin + stride * ys1);
586     const float4 *yp2 = (const float4 *)(pin + stride * ys2);
587     const float4 *yp3 = (const float4 *)(pin + stride * ys3);
588 
589     float4 *out = ((float4 *)info->outPtr[0]) + xstart;
590     uint32_t x1 = xstart;
591     uint32_t x2 = xend;
592 
593     while(x1 < x2) {
594 
595 #if defined(ARCH_X86_HAVE_AVX2)
596         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
597 #else
598         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
599 #endif
600 
601         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
602         out++;
603         x1++;
604     }
605 }
606 
kernelF2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)607 void RsdCpuScriptIntrinsicResize::kernelF2(const RsExpandKernelDriverInfo *info,
608                                                 uint32_t xstart, uint32_t xend,
609                                                 uint32_t outstep) {
610     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
611 
612     if (!cp->mAlloc.get()) {
613         ALOGE("Resize executed without input, skipping");
614         return;
615     }
616     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
617     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
618     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
619     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
620 
621 
622 #if defined(ARCH_X86_HAVE_AVX2)
623     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
624 #else
625     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
626 #endif
627 
628     int starty = (int) floor(yf - 1);
629     yf = yf - floor(yf);
630     int maxy = srcHeight - 1;
631     int ys0 = rsMax(0, starty + 0);
632     int ys1 = rsMax(0, starty + 1);
633     int ys2 = rsMin(maxy, starty + 2);
634     int ys3 = rsMin(maxy, starty + 3);
635 
636     const float2 *yp0 = (const float2 *)(pin + stride * ys0);
637     const float2 *yp1 = (const float2 *)(pin + stride * ys1);
638     const float2 *yp2 = (const float2 *)(pin + stride * ys2);
639     const float2 *yp3 = (const float2 *)(pin + stride * ys3);
640 
641     float2 *out = ((float2 *)info->outPtr[0]) + xstart;
642     uint32_t x1 = xstart;
643     uint32_t x2 = xend;
644 
645     while(x1 < x2) {
646 
647 #if defined(ARCH_X86_HAVE_AVX2)
648         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
649 #else
650         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
651 #endif
652 
653         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
654         out++;
655         x1++;
656     }
657 }
658 
kernelF1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)659 void RsdCpuScriptIntrinsicResize::kernelF1(const RsExpandKernelDriverInfo *info,
660                                                 uint32_t xstart, uint32_t xend,
661                                                 uint32_t outstep) {
662     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
663 
664     if (!cp->mAlloc.get()) {
665         ALOGE("Resize executed without input, skipping");
666         return;
667     }
668     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
669     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
670     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
671     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
672 
673 
674 #if defined(ARCH_X86_HAVE_AVX2)
675     float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
676 #else
677     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
678 #endif
679 
680     int starty = (int) floor(yf - 1);
681     yf = yf - floor(yf);
682     int maxy = srcHeight - 1;
683     int ys0 = rsMax(0, starty + 0);
684     int ys1 = rsMax(0, starty + 1);
685     int ys2 = rsMin(maxy, starty + 2);
686     int ys3 = rsMin(maxy, starty + 3);
687 
688     const float *yp0 = (const float *)(pin + stride * ys0);
689     const float *yp1 = (const float *)(pin + stride * ys1);
690     const float *yp2 = (const float *)(pin + stride * ys2);
691     const float *yp3 = (const float *)(pin + stride * ys3);
692 
693     float *out = ((float *)info->outPtr[0]) + xstart;
694     uint32_t x1 = xstart;
695     uint32_t x2 = xend;
696 
697     while(x1 < x2) {
698 
699 #if defined(ARCH_X86_HAVE_AVX2)
700         float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
701 #else
702         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
703 #endif
704 
705         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
706         out++;
707         x1++;
708     }
709 }
710 
RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)711 RsdCpuScriptIntrinsicResize::RsdCpuScriptIntrinsicResize (
712             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
713             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_RESIZE) {
714 
715 }
716 
~RsdCpuScriptIntrinsicResize()717 RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() {
718 }
719 
preLaunch(uint32_t slot,const Allocation ** ains,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)720 void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot,
721                                             const Allocation ** ains,
722                                             uint32_t inLen, Allocation * aout,
723                                             const void * usr, uint32_t usrLen,
724                                             const RsScriptCall *sc)
725 {
726     if (!mAlloc.get()) {
727         ALOGE("Resize executed without input, skipping");
728         return;
729     }
730     const uint32_t srcHeight = mAlloc->mHal.drvState.lod[0].dimY;
731     const uint32_t srcWidth = mAlloc->mHal.drvState.lod[0].dimX;
732 
733     //check the data type to determine F or U.
734     if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
735         switch(mAlloc->getType()->getElement()->getVectorSize()) {
736         case 1:
737             mRootPtr = &kernelU1;
738             break;
739         case 2:
740             mRootPtr = &kernelU2;
741             break;
742         case 3:
743         case 4:
744             mRootPtr = &kernelU4;
745             break;
746         }
747     } else {
748         switch(mAlloc->getType()->getElement()->getVectorSize()) {
749         case 1:
750             mRootPtr = &kernelF1;
751             break;
752         case 2:
753             mRootPtr = &kernelF2;
754             break;
755         case 3:
756         case 4:
757             mRootPtr = &kernelF4;
758             break;
759         }
760     }
761 
762     scaleX = (float)srcWidth / aout->mHal.drvState.lod[0].dimX;
763     scaleY = (float)srcHeight / aout->mHal.drvState.lod[0].dimY;
764 
765 }
766 
populateScript(Script * s)767 void RsdCpuScriptIntrinsicResize::populateScript(Script *s) {
768     s->mHal.info.exportedVariableCount = 1;
769 }
770 
invokeFreeChildren()771 void RsdCpuScriptIntrinsicResize::invokeFreeChildren() {
772     mAlloc.clear();
773 }
774 
rsdIntrinsic_Resize(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)775 RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
776 
777     return new RsdCpuScriptIntrinsicResize(ctx, s, e);
778 }
779 
780 } // namespace renderscript
781 } // namespace android
782