1 /*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #if defined(ARCH_X86_HAVE_AVX2)
18 #include <stdint.h>
19 #include <x86intrin.h>
20 #include <xmmintrin.h>
21 #endif
22
23 #include "rsCpuIntrinsic.h"
24 #include "rsCpuIntrinsicInlines.h"
25
26 namespace android {
27 namespace renderscript {
28
29
30 class RsdCpuScriptIntrinsicResize : public RsdCpuScriptIntrinsic {
31 public:
32 void populateScript(Script *) override;
33 void invokeFreeChildren() override;
34
35 void setGlobalObj(uint32_t slot, ObjectBase *data) override;
36
37 ~RsdCpuScriptIntrinsicResize() override;
38 RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
39
40 void preLaunch(uint32_t slot, const Allocation ** ains,
41 uint32_t inLen, Allocation * aout, const void * usr,
42 uint32_t usrLen, const RsScriptCall *sc) override;
43
44 float scaleX;
45 float scaleY;
46
47 protected:
48 ObjectBaseRef<const Allocation> mAlloc;
49 ObjectBaseRef<const Element> mElement;
50
51 static void kernelU1(const RsExpandKernelDriverInfo *info,
52 uint32_t xstart, uint32_t xend,
53 uint32_t outstep);
54 static void kernelU2(const RsExpandKernelDriverInfo *info,
55 uint32_t xstart, uint32_t xend,
56 uint32_t outstep);
57 static void kernelU4(const RsExpandKernelDriverInfo *info,
58 uint32_t xstart, uint32_t xend,
59 uint32_t outstep);
60 static void kernelF1(const RsExpandKernelDriverInfo *info,
61 uint32_t xstart, uint32_t xend,
62 uint32_t outstep);
63 static void kernelF2(const RsExpandKernelDriverInfo *info,
64 uint32_t xstart, uint32_t xend,
65 uint32_t outstep);
66 static void kernelF4(const RsExpandKernelDriverInfo *info,
67 uint32_t xstart, uint32_t xend,
68 uint32_t outstep);
69 };
70
setGlobalObj(uint32_t slot,ObjectBase * data)71 void RsdCpuScriptIntrinsicResize::setGlobalObj(uint32_t slot, ObjectBase *data) {
72 rsAssert(slot == 0);
73 mAlloc.set(static_cast<Allocation *>(data));
74 }
75
cubicInterpolate(float4 p0,float4 p1,float4 p2,float4 p3,float x)76 static float4 cubicInterpolate(float4 p0,float4 p1,float4 p2,float4 p3, float x) {
77 return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
78 + x * (3.f * (p1 - p2) + p3 - p0)));
79 }
80
cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3,float x)81 static float2 cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3, float x) {
82 return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
83 + x * (3.f * (p1 - p2) + p3 - p0)));
84 }
85
86
// Scalar cubic interpolation between p1 and p2 for x in [0, 1]; p0 and p3 are
// the outer control points. Returns p1 at x == 0.
//
// When ARCH_X86_HAVE_AVX2 is defined, two sub-terms are evaluated with fused
// multiply-add/subtract intrinsics; otherwise plain float arithmetic is used.
static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) {
#if defined(ARCH_X86_HAVE_AVX2)
    // 4 * p2 - p3 as a single fused multiply-subtract.
    const float fourP2MinusP3 = _mm_cvtss_f32(
            _mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2), _mm_set1_ps(p3)));
    // 3 * (p1 - p2) + (p3 - p0) as a single fused multiply-add.
    const float cubicTerm = _mm_cvtss_f32(
            _mm_fmadd_ss(_mm_set1_ps(3.f), _mm_set1_ps(p1 - p2), _mm_set1_ps(p3 - p0)));

    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + fourP2MinusP3
            + x * cubicTerm));
#else
    const float cubicTerm = 3.f * (p1 - p2) + p3 - p0;
    const float quadTerm = 2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + x * cubicTerm;
    const float linearTerm = p2 - p0 + x * quadTerm;
    return p1 + 0.5f * x * linearTerm;
#endif
}
100
// Produces one bicubic-filtered uchar4 pixel.
//
// yp0..yp3 are the four source rows to blend, xf is the horizontal sample
// position in source pixels, yf is the vertical blend factor between the
// middle rows, and width is the source row length in pixels.
static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
                         float xf, float yf, int width) {
    // Leftmost of the four taps, and the fractional offset between the two
    // middle taps.
    const int firstTap = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);

    // The two left taps are clamped at column 0, the two right taps at the
    // last column.
    const int lastCol = width - 1;
    const int c0 = rsMax(0, firstTap + 0);
    const int c1 = rsMax(0, firstTap + 1);
    const int c2 = rsMin(lastCol, firstTap + 2);
    const int c3 = rsMin(lastCol, firstTap + 3);

    // Horizontal pass: one interpolated value per source row.
    const uchar4 *rows[4] = { yp0, yp1, yp2, yp3 };
    float4 perRow[4];
    for (int r = 0; r < 4; r++) {
        const uchar4 *row = rows[r];
        perRow[r] = cubicInterpolate(convert_float4(row[c0]),
                                     convert_float4(row[c1]),
                                     convert_float4(row[c2]),
                                     convert_float4(row[c3]), fracX);
    }

    // Vertical pass, then round to nearest and clamp to the 8-bit range.
    float4 blended = cubicInterpolate(perRow[0], perRow[1], perRow[2], perRow[3], yf);
    blended = clamp(blended + 0.5f, 0.f, 255.f);
    return convert_uchar4(blended);
}
135
// Produces one bicubic-filtered uchar2 pixel.
//
// yp0..yp3 are the four source rows to blend, xf is the horizontal sample
// position in source pixels, yf is the vertical blend factor between the
// middle rows, and width is the source row length in pixels.
static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
                         float xf, float yf, int width) {
    // Leftmost of the four taps, and the fractional offset between the two
    // middle taps.
    const int firstTap = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);

    // The two left taps are clamped at column 0, the two right taps at the
    // last column.
    const int lastCol = width - 1;
    const int c0 = rsMax(0, firstTap + 0);
    const int c1 = rsMax(0, firstTap + 1);
    const int c2 = rsMin(lastCol, firstTap + 2);
    const int c3 = rsMin(lastCol, firstTap + 3);

    // Horizontal pass: one interpolated value per source row.
    const uchar2 *rows[4] = { yp0, yp1, yp2, yp3 };
    float2 perRow[4];
    for (int r = 0; r < 4; r++) {
        const uchar2 *row = rows[r];
        perRow[r] = cubicInterpolate(convert_float2(row[c0]),
                                     convert_float2(row[c1]),
                                     convert_float2(row[c2]),
                                     convert_float2(row[c3]), fracX);
    }

    // Vertical pass, then round to nearest and clamp to the 8-bit range.
    float2 blended = cubicInterpolate(perRow[0], perRow[1], perRow[2], perRow[3], yf);
    blended = clamp(blended + 0.5f, 0.f, 255.f);
    return convert_uchar2(blended);
}
170
// Produces one bicubic-filtered single-channel 8-bit pixel.
//
// yp0..yp3 are the four source rows to blend, xf is the horizontal sample
// position in source pixels, yf is the vertical blend factor between the
// middle rows, and width is the source row length in pixels.
static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
                        float xf, float yf, int width) {
    // Leftmost of the four taps, and the fractional offset between the two
    // middle taps.
    const int firstTap = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);

    // The two left taps are clamped at column 0, the two right taps at the
    // last column.
    const int lastCol = width - 1;
    const int c0 = rsMax(0, firstTap + 0);
    const int c1 = rsMax(0, firstTap + 1);
    const int c2 = rsMin(lastCol, firstTap + 2);
    const int c3 = rsMin(lastCol, firstTap + 3);

    // Horizontal pass: one interpolated value per source row.
    const uchar *rows[4] = { yp0, yp1, yp2, yp3 };
    float perRow[4];
    for (int r = 0; r < 4; r++) {
        const uchar *row = rows[r];
        perRow[r] = cubicInterpolate((float)row[c0], (float)row[c1],
                                     (float)row[c2], (float)row[c3], fracX);
    }

    // Vertical pass, then round to nearest and clamp to the 8-bit range.
    float blended = cubicInterpolate(perRow[0], perRow[1], perRow[2], perRow[3], yf);
    blended = clamp(blended + 0.5f, 0.f, 255.f);
    return (uchar)blended;
}
194
195 extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);
196
197 extern "C" void rsdIntrinsicResizeB4_K(
198 uchar4 *dst,
199 size_t count,
200 uint32_t xf,
201 uint32_t xinc,
202 uchar4 const *srcn,
203 uchar4 const *src0,
204 uchar4 const *src1,
205 uchar4 const *src2,
206 size_t xclip,
207 size_t avail,
208 uint64_t osc_ctl,
209 int32_t const *yr);
210
211 extern "C" void rsdIntrinsicResizeB2_K(
212 uchar2 *dst,
213 size_t count,
214 uint32_t xf,
215 uint32_t xinc,
216 uchar2 const *srcn,
217 uchar2 const *src0,
218 uchar2 const *src1,
219 uchar2 const *src2,
220 size_t xclip,
221 size_t avail,
222 uint64_t osc_ctl,
223 int32_t const *yr);
224
225 extern "C" void rsdIntrinsicResizeB1_K(
226 uchar *dst,
227 size_t count,
228 uint32_t xf,
229 uint32_t xinc,
230 uchar const *srcn,
231 uchar const *src0,
232 uchar const *src1,
233 uchar const *src2,
234 size_t xclip,
235 size_t avail,
236 uint64_t osc_ctl,
237 int32_t const *yr);
238
#if defined(ARCH_ARM_USE_INTRINSICS)
// Fills yr[0..3] with fixed-point vertical cubic weights for the fractional
// row position yf. The powers of yf are scaled by 0x10000 and rounded before
// the weights are formed.
static void mkYCoeff(int32_t *yr, float yf) {
    // yf, yf^2 and yf^3 in fixed point.
    const int32_t lin = rint(yf * 0x10000);
    const int32_t sq = rint(yf * yf * 0x10000);
    const int32_t cu = rint(yf * yf * yf * 0x10000);

    // Each weight is formed at twice its final scale and halved below. The
    // outer two entries are stored negated relative to the usual cubic
    // weights; presumably the consumer subtracts them -- confirm against the
    // routine that reads yr.
    const int32_t doubled[4] = {
        -(2 * sq - cu - lin),
        (3 * cu - 5 * sq + 0x20000),
        (-3 * cu + 4 * sq + lin),
        -(cu - sq),
    };

    for (int i = 0; i < 4; i++) {
        yr[i] = doubled[i] >> 1;
    }
}
#endif
251
// Produces one bicubic-filtered float4 pixel. Unlike the 8-bit variants, the
// result is returned without rounding or clamping.
//
// yp0..yp3 are the four source rows to blend, xf is the horizontal sample
// position in source pixels, yf is the vertical blend factor between the
// middle rows, and width is the source row length in pixels.
static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3,
                         float xf, float yf, int width) {
    // Leftmost of the four taps, and the fractional offset between the two
    // middle taps.
    const int firstTap = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);

    // The two left taps are clamped at column 0, the two right taps at the
    // last column.
    const int lastCol = width - 1;
    const int c0 = rsMax(0, firstTap + 0);
    const int c1 = rsMax(0, firstTap + 1);
    const int c2 = rsMin(lastCol, firstTap + 2);
    const int c3 = rsMin(lastCol, firstTap + 3);

    // Horizontal pass: one interpolated value per source row.
    const float4 *rows[4] = { yp0, yp1, yp2, yp3 };
    float4 perRow[4];
    for (int r = 0; r < 4; r++) {
        const float4 *row = rows[r];
        perRow[r] = cubicInterpolate(row[c0], row[c1], row[c2], row[c3], fracX);
    }

    // Vertical pass.
    return cubicInterpolate(perRow[0], perRow[1], perRow[2], perRow[3], yf);
}
274
// Produces one bicubic-filtered float2 pixel. Unlike the 8-bit variants, the
// result is returned without rounding or clamping.
//
// yp0..yp3 are the four source rows to blend, xf is the horizontal sample
// position in source pixels, yf is the vertical blend factor between the
// middle rows, and width is the source row length in pixels.
static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3,
                         float xf, float yf, int width) {
    // Leftmost of the four taps, and the fractional offset between the two
    // middle taps.
    const int firstTap = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);

    // The two left taps are clamped at column 0, the two right taps at the
    // last column.
    const int lastCol = width - 1;
    const int c0 = rsMax(0, firstTap + 0);
    const int c1 = rsMax(0, firstTap + 1);
    const int c2 = rsMin(lastCol, firstTap + 2);
    const int c3 = rsMin(lastCol, firstTap + 3);

    // Horizontal pass: one interpolated value per source row.
    const float2 *rows[4] = { yp0, yp1, yp2, yp3 };
    float2 perRow[4];
    for (int r = 0; r < 4; r++) {
        const float2 *row = rows[r];
        perRow[r] = cubicInterpolate(row[c0], row[c1], row[c2], row[c3], fracX);
    }

    // Vertical pass.
    return cubicInterpolate(perRow[0], perRow[1], perRow[2], perRow[3], yf);
}
297
OneBiCubic(const float * yp0,const float * yp1,const float * yp2,const float * yp3,float xf,float yf,int width)298 static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
299 float xf, float yf, int width) {
300 int startx = (int) floor(xf - 1);
301 xf = xf - floor(xf);
302 int maxx = width - 1;
303 int xs0 = rsMax(0, startx + 0);
304 int xs1 = rsMax(0, startx + 1);
305 int xs2 = rsMin(maxx, startx + 2);
306 int xs3 = rsMin(maxx, startx + 3);
307
308 float p0 = cubicInterpolate(yp0[xs0], yp0[xs1],
309 yp0[xs2], yp0[xs3], xf);
310 float p1 = cubicInterpolate(yp1[xs0], yp1[xs1],
311 yp1[xs2], yp1[xs3], xf);
312 float p2 = cubicInterpolate(yp2[xs0], yp2[xs1],
313 yp2[xs2], yp2[xs3], xf);
314 float p3 = cubicInterpolate(yp3[xs0], yp3[xs1],
315 yp3[xs2], yp3[xs3], xf);
316
317 float p = cubicInterpolate(p0, p1, p2, p3, yf);
318 return p;
319 }
320
kernelU4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)321 void RsdCpuScriptIntrinsicResize::kernelU4(const RsExpandKernelDriverInfo *info,
322 uint32_t xstart, uint32_t xend,
323 uint32_t outstep) {
324 RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
325
326 if (!cp->mAlloc.get()) {
327 ALOGE("Resize executed without input, skipping");
328 return;
329 }
330 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
331 const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
332 const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
333 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
334
335
336 #if defined(ARCH_X86_HAVE_AVX2)
337 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
338 #else
339 float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
340 #endif
341
342
343 int starty = (int) floor(yf - 1);
344 yf = yf - floor(yf);
345 int maxy = srcHeight - 1;
346 int ys0 = rsMax(0, starty + 0);
347 int ys1 = rsMax(0, starty + 1);
348 int ys2 = rsMin(maxy, starty + 2);
349 int ys3 = rsMin(maxy, starty + 3);
350
351 const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
352 const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
353 const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
354 const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
355
356 uchar4 *out = ((uchar4 *)info->outPtr[0]) + xstart;
357 uint32_t x1 = xstart;
358 uint32_t x2 = xend;
359
360 #if defined(ARCH_ARM_USE_INTRINSICS)
361 if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
362 float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
363 long xf16 = rint(xf * 0x10000);
364 uint32_t xinc16 = rint(cp->scaleX * 0x10000);
365
366 int xoff = (xf16 >> 16) - 1;
367 int xclip = rsMax(0, xoff) - xoff;
368 int len = x2 - x1;
369
370 int32_t yr[4];
371 uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
372 mkYCoeff(yr, yf);
373
374 xoff += xclip;
375
376 rsdIntrinsicResizeB4_K(
377 out, len,
378 xf16 & 0xffff, xinc16,
379 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
380 xclip, srcWidth - xoff + xclip,
381 osc_ctl, yr);
382 out += len;
383 x1 += len;
384 }
385 #endif
386
387 while(x1 < x2) {
388 #if defined(ARCH_X86_HAVE_AVX2)
389 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
390 #else
391 float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
392 #endif
393 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
394 out++;
395 x1++;
396 }
397 }
398
kernelU2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)399 void RsdCpuScriptIntrinsicResize::kernelU2(const RsExpandKernelDriverInfo *info,
400 uint32_t xstart, uint32_t xend,
401 uint32_t outstep) {
402 RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
403
404 if (!cp->mAlloc.get()) {
405 ALOGE("Resize executed without input, skipping");
406 return;
407 }
408 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
409 const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
410 const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
411 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
412
413
414 #if defined(ARCH_X86_HAVE_AVX2)
415 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
416 #else
417 float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
418 #endif
419
420 int starty = (int) floor(yf - 1);
421 yf = yf - floor(yf);
422 int maxy = srcHeight - 1;
423 int ys0 = rsMax(0, starty + 0);
424 int ys1 = rsMax(0, starty + 1);
425 int ys2 = rsMin(maxy, starty + 2);
426 int ys3 = rsMin(maxy, starty + 3);
427
428 const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
429 const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
430 const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
431 const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
432
433 uchar2 *out = ((uchar2 *)info->outPtr[0]) + xstart;
434 uint32_t x1 = xstart;
435 uint32_t x2 = xend;
436
437 #if defined(ARCH_ARM_USE_INTRINSICS)
438 if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
439 float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
440 long xf16 = rint(xf * 0x10000);
441 uint32_t xinc16 = rint(cp->scaleX * 0x10000);
442
443 int xoff = (xf16 >> 16) - 1;
444 int xclip = rsMax(0, xoff) - xoff;
445 int len = x2 - x1;
446
447 int32_t yr[4];
448 uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
449 mkYCoeff(yr, yf);
450
451 xoff += xclip;
452
453 rsdIntrinsicResizeB2_K(
454 out, len,
455 xf16 & 0xffff, xinc16,
456 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
457 xclip, srcWidth - xoff + xclip,
458 osc_ctl, yr);
459 out += len;
460 x1 += len;
461 }
462 #endif
463
464 while(x1 < x2) {
465
466 #if defined(ARCH_X86_HAVE_AVX2)
467 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
468 #else
469 float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
470 #endif
471 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
472 out++;
473 x1++;
474 }
475 }
476
kernelU1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)477 void RsdCpuScriptIntrinsicResize::kernelU1(const RsExpandKernelDriverInfo *info,
478 uint32_t xstart, uint32_t xend,
479 uint32_t outstep) {
480 RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
481
482 if (!cp->mAlloc.get()) {
483 ALOGE("Resize executed without input, skipping");
484 return;
485 }
486 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
487 const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
488 const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
489 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
490
491
492 #if defined(ARCH_X86_HAVE_AVX2)
493 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
494 #else
495 float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
496 #endif
497
498 int starty = (int) floor(yf - 1);
499 yf = yf - floor(yf);
500 int maxy = srcHeight - 1;
501 int ys0 = rsMax(0, starty + 0);
502 int ys1 = rsMax(0, starty + 1);
503 int ys2 = rsMin(maxy, starty + 2);
504 int ys3 = rsMin(maxy, starty + 3);
505
506 const uchar *yp0 = pin + stride * ys0;
507 const uchar *yp1 = pin + stride * ys1;
508 const uchar *yp2 = pin + stride * ys2;
509 const uchar *yp3 = pin + stride * ys3;
510
511 uchar *out = ((uchar *)info->outPtr[0]) + xstart;
512 uint32_t x1 = xstart;
513 uint32_t x2 = xend;
514
515 #if defined(ARCH_ARM_USE_INTRINSICS)
516 if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
517 float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
518 long xf16 = rint(xf * 0x10000);
519 uint32_t xinc16 = rint(cp->scaleX * 0x10000);
520
521 int xoff = (xf16 >> 16) - 1;
522 int xclip = rsMax(0, xoff) - xoff;
523 int len = x2 - x1;
524
525 int32_t yr[4];
526 uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
527 mkYCoeff(yr, yf);
528
529 xoff += xclip;
530
531 rsdIntrinsicResizeB1_K(
532 out, len,
533 xf16 & 0xffff, xinc16,
534 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
535 xclip, srcWidth - xoff + xclip,
536 osc_ctl, yr);
537 out += len;
538 x1 += len;
539 }
540 #endif
541
542 while(x1 < x2) {
543
544 #if defined(ARCH_X86_HAVE_AVX2)
545 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
546 #else
547 float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
548 #endif
549
550 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
551 out++;
552 x1++;
553 }
554 }
555
kernelF4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)556 void RsdCpuScriptIntrinsicResize::kernelF4(const RsExpandKernelDriverInfo *info,
557 uint32_t xstart, uint32_t xend,
558 uint32_t outstep) {
559 RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
560
561 if (!cp->mAlloc.get()) {
562 ALOGE("Resize executed without input, skipping");
563 return;
564 }
565 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
566 const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
567 const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
568 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
569
570 #if defined(ARCH_X86_HAVE_AVX2)
571 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
572 #else
573 float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
574 #endif
575
576 int starty = (int) floor(yf - 1);
577 yf = yf - floor(yf);
578 int maxy = srcHeight - 1;
579 int ys0 = rsMax(0, starty + 0);
580 int ys1 = rsMax(0, starty + 1);
581 int ys2 = rsMin(maxy, starty + 2);
582 int ys3 = rsMin(maxy, starty + 3);
583
584 const float4 *yp0 = (const float4 *)(pin + stride * ys0);
585 const float4 *yp1 = (const float4 *)(pin + stride * ys1);
586 const float4 *yp2 = (const float4 *)(pin + stride * ys2);
587 const float4 *yp3 = (const float4 *)(pin + stride * ys3);
588
589 float4 *out = ((float4 *)info->outPtr[0]) + xstart;
590 uint32_t x1 = xstart;
591 uint32_t x2 = xend;
592
593 while(x1 < x2) {
594
595 #if defined(ARCH_X86_HAVE_AVX2)
596 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
597 #else
598 float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
599 #endif
600
601 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
602 out++;
603 x1++;
604 }
605 }
606
kernelF2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)607 void RsdCpuScriptIntrinsicResize::kernelF2(const RsExpandKernelDriverInfo *info,
608 uint32_t xstart, uint32_t xend,
609 uint32_t outstep) {
610 RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
611
612 if (!cp->mAlloc.get()) {
613 ALOGE("Resize executed without input, skipping");
614 return;
615 }
616 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
617 const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
618 const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
619 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
620
621
622 #if defined(ARCH_X86_HAVE_AVX2)
623 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
624 #else
625 float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
626 #endif
627
628 int starty = (int) floor(yf - 1);
629 yf = yf - floor(yf);
630 int maxy = srcHeight - 1;
631 int ys0 = rsMax(0, starty + 0);
632 int ys1 = rsMax(0, starty + 1);
633 int ys2 = rsMin(maxy, starty + 2);
634 int ys3 = rsMin(maxy, starty + 3);
635
636 const float2 *yp0 = (const float2 *)(pin + stride * ys0);
637 const float2 *yp1 = (const float2 *)(pin + stride * ys1);
638 const float2 *yp2 = (const float2 *)(pin + stride * ys2);
639 const float2 *yp3 = (const float2 *)(pin + stride * ys3);
640
641 float2 *out = ((float2 *)info->outPtr[0]) + xstart;
642 uint32_t x1 = xstart;
643 uint32_t x2 = xend;
644
645 while(x1 < x2) {
646
647 #if defined(ARCH_X86_HAVE_AVX2)
648 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
649 #else
650 float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
651 #endif
652
653 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
654 out++;
655 x1++;
656 }
657 }
658
kernelF1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)659 void RsdCpuScriptIntrinsicResize::kernelF1(const RsExpandKernelDriverInfo *info,
660 uint32_t xstart, uint32_t xend,
661 uint32_t outstep) {
662 RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
663
664 if (!cp->mAlloc.get()) {
665 ALOGE("Resize executed without input, skipping");
666 return;
667 }
668 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
669 const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
670 const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
671 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
672
673
674 #if defined(ARCH_X86_HAVE_AVX2)
675 float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(info->current.y + 0.5f),_mm_set1_ps(cp->scaleY), _mm_set1_ps(0.5f)));
676 #else
677 float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
678 #endif
679
680 int starty = (int) floor(yf - 1);
681 yf = yf - floor(yf);
682 int maxy = srcHeight - 1;
683 int ys0 = rsMax(0, starty + 0);
684 int ys1 = rsMax(0, starty + 1);
685 int ys2 = rsMin(maxy, starty + 2);
686 int ys3 = rsMin(maxy, starty + 3);
687
688 const float *yp0 = (const float *)(pin + stride * ys0);
689 const float *yp1 = (const float *)(pin + stride * ys1);
690 const float *yp2 = (const float *)(pin + stride * ys2);
691 const float *yp3 = (const float *)(pin + stride * ys3);
692
693 float *out = ((float *)info->outPtr[0]) + xstart;
694 uint32_t x1 = xstart;
695 uint32_t x2 = xend;
696
697 while(x1 < x2) {
698
699 #if defined(ARCH_X86_HAVE_AVX2)
700 float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(cp->scaleX) , _mm_set1_ps(0.5f)));
701 #else
702 float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
703 #endif
704
705 *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
706 out++;
707 x1++;
708 }
709 }
710
RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)711 RsdCpuScriptIntrinsicResize::RsdCpuScriptIntrinsicResize (
712 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
713 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_RESIZE) {
714
715 }
716
~RsdCpuScriptIntrinsicResize()717 RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() {
718 }
719
preLaunch(uint32_t slot,const Allocation ** ains,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)720 void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot,
721 const Allocation ** ains,
722 uint32_t inLen, Allocation * aout,
723 const void * usr, uint32_t usrLen,
724 const RsScriptCall *sc)
725 {
726 if (!mAlloc.get()) {
727 ALOGE("Resize executed without input, skipping");
728 return;
729 }
730 const uint32_t srcHeight = mAlloc->mHal.drvState.lod[0].dimY;
731 const uint32_t srcWidth = mAlloc->mHal.drvState.lod[0].dimX;
732
733 //check the data type to determine F or U.
734 if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
735 switch(mAlloc->getType()->getElement()->getVectorSize()) {
736 case 1:
737 mRootPtr = &kernelU1;
738 break;
739 case 2:
740 mRootPtr = &kernelU2;
741 break;
742 case 3:
743 case 4:
744 mRootPtr = &kernelU4;
745 break;
746 }
747 } else {
748 switch(mAlloc->getType()->getElement()->getVectorSize()) {
749 case 1:
750 mRootPtr = &kernelF1;
751 break;
752 case 2:
753 mRootPtr = &kernelF2;
754 break;
755 case 3:
756 case 4:
757 mRootPtr = &kernelF4;
758 break;
759 }
760 }
761
762 scaleX = (float)srcWidth / aout->mHal.drvState.lod[0].dimX;
763 scaleY = (float)srcHeight / aout->mHal.drvState.lod[0].dimY;
764
765 }
766
populateScript(Script * s)767 void RsdCpuScriptIntrinsicResize::populateScript(Script *s) {
768 s->mHal.info.exportedVariableCount = 1;
769 }
770
invokeFreeChildren()771 void RsdCpuScriptIntrinsicResize::invokeFreeChildren() {
772 mAlloc.clear();
773 }
774
rsdIntrinsic_Resize(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)775 RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
776
777 return new RsdCpuScriptIntrinsicResize(ctx, s, e);
778 }
779
780 } // namespace renderscript
781 } // namespace android
782