1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
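/*
 * x86 SSE kernels (SSSE3 minimum, SSE4.1 fast paths) that appear to implement
 * the RenderScript CPU intrinsics: 3x3/5x5 convolution, color matrix, blur,
 * YUV-to-RGBA conversion and per-pixel blending.  The small helpers below
 * emulate a few SSE4.1 instructions when only SSSE3 is available.
 */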
17 #include <stdint.h>
18 #include <x86intrin.h>
19 
20 /* Unsigned-extend packed 8-bit integers (in the LSBs of x) into packed 32-bit integers */
21 static inline __m128i cvtepu8_epi32(__m128i x) {
22 #if defined(__SSE4_1__)
23     return _mm_cvtepu8_epi32(x);
24 #elif defined(__SSSE3__)
25     const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
26     x = _mm_shuffle_epi8(x, M8to32);
27     return x;
28 #else
29 #   error "Require at least SSSE3"
30 #endif
31 }
32 
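/*
 * Pack signed 32-bit integers into unsigned 16-bit with saturation.
 * The SSSE3 fallback clamps negative lanes to zero (AND with a "greater than
 * zero" mask), saturates lanes above 0xffff to all-ones (OR with a "greater
 * than 0xffff" mask), then gathers the low 16 bits of each lane by byte shuffle.
 */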
33 static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
34 #if defined(__SSE4_1__)
35     return _mm_packus_epi32(lo, hi);
36 #elif defined(__SSSE3__)
37     const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
38     const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
39     const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
40     const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
41     lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
42     lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
43     hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
44     hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
45     return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
46                         _mm_shuffle_epi8(hi, M32to16H));
47 #else
48 #   error "Require at least SSSE3"
49 #endif
50 }
51 
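/*
 * Multiply packed 32-bit integers, keeping the low 32 bits of each product.
 * The low half of a product does not depend on signedness, so the SSSE3
 * fallback multiplies the even and odd lanes with _mm_mul_epu32 and merges them.
 */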
52 static inline __m128i mullo_epi32(__m128i x, __m128i y) {
53 #if defined(__SSE4_1__)
54     return _mm_mullo_epi32(x, y);
55 #elif defined(__SSSE3__)
56     const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
57     __m128i even = _mm_mul_epu32(x, y);
58     __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
59                                 _mm_srli_si128(y, 4));
60     even = _mm_and_si128(even, Meven);
61     odd = _mm_and_si128(odd, Meven);
62     return _mm_or_si128(even, _mm_slli_si128(odd, 4));
63 #else
64 #   error "Require at least SSSE3"
65 #endif
66 }
67 
68 /* 'mask' must be packed 8-bit values, each either 0x00 or 0xff */
69 static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
70 #if defined(__SSE4_1__)
71     return _mm_blendv_epi8(x, y, mask);
72 #elif defined(__SSSE3__)
73     return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
74 #else
75 #   error "Require at least SSSE3"
76 #endif
77 }
78 
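/*
 * 3x3 convolution over 8-bit RGBA pixels, two output pixels per iteration.
 * y0/y1/y2 point at the three input rows and coef holds nine 16-bit
 * coefficients, presumably in Q8 fixed point: taps are multiply-accumulated
 * in pairs with _mm_madd_epi16 and the sums are shifted right by 8 before
 * being packed back to bytes with saturation.
 */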
79 extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
80                                           const void *y1, const void *y2,
81                                           const short *coef, uint32_t count) {
82     __m128i x;
83     __m128i c0, c2, c4, c6, c8;
84     __m128i r0, r1, r2;
85     __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
86     __m128i o0, o1;
87     uint32_t i;
88 
89     x = _mm_loadl_epi64((const __m128i *)(coef+0));
90     c0 = _mm_shuffle_epi32(x, 0x00);
91     c2 = _mm_shuffle_epi32(x, 0x55);
92     x = _mm_loadl_epi64((const __m128i *)(coef+4));
93     c4 = _mm_shuffle_epi32(x, 0x00);
94     c6 = _mm_shuffle_epi32(x, 0x55);
95     x = _mm_loadl_epi64((const __m128i *)(coef+8));
96     c8 = _mm_shuffle_epi32(x, 0x00);
97 
98     for (i = 0; i < count; ++i) {
99 
100         p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
101         p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
102         p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
103         p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
104         p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
105         p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
106         p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
107         p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
108         p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
109         p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
110         p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
111         p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
112 
113         o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
114         o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
115 
116         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
117         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
118 
119         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
120         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
121 
122         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
123         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
124 
125         o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
126         o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
127 
128         o0 = _mm_srai_epi32(o0, 8);
129         o1 = _mm_srai_epi32(o1, 8);
130 
131         o0 = packus_epi32(o0, o1);
132         o0 = _mm_packus_epi16(o0, o0);
133         _mm_storel_epi64((__m128i *)dst, o0);
134 
135         y0 = (const char *)y0 + 8;
136         y1 = (const char *)y1 + 8;
137         y2 = (const char *)y2 + 8;
138         dst = (char *)dst + 8;
139     }
140 }
141 
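/*
 * Multiply each RGBA pixel by a 4x4 matrix of 16-bit coefficients (presumably
 * Q8 fixed point; sums are shifted right by 8).  Mxy/Mzw de-interleave four
 * pixels into (r,g) and (b,a) 16-bit pairs for _mm_madd_epi16, and T4x4
 * transposes the packed channel-major result back into pixel order.  Note the
 * source is read with an aligned load here, unlike the 3x3 variant below.
 */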
142 void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
143                                   const short *coef, uint32_t count) {
144     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
145                                       14, 10, 6, 2,
146                                       13,  9, 5, 1,
147                                       12,  8, 4, 0);
148 
149     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
150     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
151     __m128i c0, c1, c2, c3;
152     __m128i i4, o4;
153     __m128i xy, zw;
154     __m128i x2, y2, z2, w2;
155     uint32_t i;
156 
157     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
158     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
159     c0 = _mm_unpacklo_epi16(c0, c1);
160 
161     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
162     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
163     c2 = _mm_unpacklo_epi16(c2, c3);
164 
165     for (i = 0; i < count; ++i) {
166         i4 = _mm_load_si128((const __m128i *)src);
167         xy = _mm_shuffle_epi8(i4, Mxy);
168         zw = _mm_shuffle_epi8(i4, Mzw);
169 
170         x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
171         y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
172         z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
173         w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
174 
175         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
176         y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
177         z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
178         w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
179 
180         x2 = _mm_srai_epi32(x2, 8);
181         y2 = _mm_srai_epi32(y2, 8);
182         z2 = _mm_srai_epi32(z2, 8);
183         w2 = _mm_srai_epi32(w2, 8);
184 
185         x2 = packus_epi32(x2, y2);
186         z2 = packus_epi32(z2, w2);
187         o4 = _mm_packus_epi16(x2, z2);
188 
189         o4 = _mm_shuffle_epi8(o4, T4x4);
190         _mm_storeu_si128((__m128i *)dst, o4);
191 
192         src = (const char *)src + 16;
193         dst = (char *)dst + 16;
194     }
195 }
196 
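/*
 * Same as the 4x4 variant, but only three matrix rows are applied; the output
 * alpha is the source alpha, recovered from the high halves of the (b,a)
 * pairs by the 16-bit shift below.  The source is read unaligned here.
 */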
197 void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
198                                   const short *coef, uint32_t count) {
199     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
200                                       14, 10, 6, 2,
201                                       13,  9, 5, 1,
202                                       12,  8, 4, 0);
203 
204     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
205     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
206 
207     __m128i c0, c1, c2, c3;
208     __m128i i4, o4;
209     __m128i xy, zw;
210     __m128i x2, y2, z2, w2;
211     uint32_t i;
212 
213     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
214     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
215     c0 = _mm_unpacklo_epi16(c0, c1);
216 
217     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
218     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
219     c2 = _mm_unpacklo_epi16(c2, c3);
220 
221     for (i = 0; i < count; ++i) {
222         i4 = _mm_loadu_si128((const __m128i *)src);
223         xy = _mm_shuffle_epi8(i4, Mxy);
224         zw = _mm_shuffle_epi8(i4, Mzw);
225 
226         x2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
227         y2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
228         z2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
229 
230         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
231         y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
232         z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
233 
234         x2 = _mm_srai_epi32(x2, 8);
235         y2 = _mm_srai_epi32(y2, 8);
236         z2 = _mm_srai_epi32(z2, 8);
237         w2 = _mm_srli_epi32(zw, 16);
238 
239         x2 = packus_epi32(x2, y2);
240         z2 = packus_epi32(z2, w2);
241         o4 = _mm_packus_epi16(x2, z2);
242 
243         o4 = _mm_shuffle_epi8(o4, T4x4);
244         _mm_storeu_si128((__m128i *)dst, o4);
245 
246         src = (const char *)src + 16;
247         dst = (char *)dst + 16;
248     }
249 }
250 
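/*
 * Dot-product variant: _mm_shufflelo_epi16(..., 0) broadcasts a single
 * coefficient per matrix row, so r, g and b all receive the same weighted sum
 * of the input channels (e.g. a luminance value) while alpha passes through.
 */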
251 void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
252                                   const short *coef, uint32_t count) {
253     const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
254                                       14, 10, 6, 2,
255                                       13,  9, 5, 1,
256                                       12,  8, 4, 0);
257     const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
258     const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
259     __m128i c0, c1, c2, c3;
260     __m128i i4, o4;
261     __m128i xy, zw;
262     __m128i x2, y2, z2, w2;
263     uint32_t i;
264 
265     c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
266     c0 = _mm_shufflelo_epi16(c0, 0);
267     c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
268     c1 = _mm_shufflelo_epi16(c1, 0);
269     c0 = _mm_unpacklo_epi16(c0, c1);
270 
271     c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
272     c2 = _mm_shufflelo_epi16(c2, 0);
273     c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
274     c3 = _mm_shufflelo_epi16(c3, 0);
275     c2 = _mm_unpacklo_epi16(c2, c3);
276 
277     for (i = 0; i < count; ++i) {
278         i4 = _mm_loadu_si128((const __m128i *)src);
279 
280         xy = _mm_shuffle_epi8(i4, Mxy);
281         zw = _mm_shuffle_epi8(i4, Mzw);
282 
283         x2 =  _mm_madd_epi16(xy, c0);
284         x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
285 
286         x2 = _mm_srai_epi32(x2, 8);
287         y2 = x2;
288         z2 = x2;
289         w2 = _mm_srli_epi32(zw, 16);
290 
291         x2 = packus_epi32(x2, y2);
292         z2 = packus_epi32(z2, w2);
293         o4 = _mm_packus_epi16(x2, z2);
294 
295         o4 = _mm_shuffle_epi8(o4, T4x4);
296         _mm_storeu_si128((__m128i *)dst, o4);
297 
298         src = (const char *)src + 16;
299         dst = (char *)dst + 16;
300     }
301 }
302 
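/*
 * Vertical blur pass: for each pair of RGBA pixels (x1, x1+1) this walks rct
 * input rows spaced 'stride' bytes apart, converts the bytes to floats and
 * accumulates them weighted by the float kernel in gptr.  The result is left
 * unpacked as two float4 pixels (32 bytes) for the horizontal pass below.
 */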
303 void rsdIntrinsicBlurVFU4_K(void *dst,
304                           const void *pin, int stride, const void *gptr,
305                           int rct, int x1, int x2) {
306     const char *pi;
307     __m128i pi0, pi1;
308     __m128 pf0, pf1;
309     __m128 bp0, bp1;
310     __m128 x;
311     int r;
312 
313     for (; x1 < x2; x1 += 2) {
314         pi = (const char *)pin + (x1 << 2);
315         bp0 = _mm_setzero_ps();
316         bp1 = _mm_setzero_ps();
317 
318         for (r = 0; r < rct; ++r) {
319             x = _mm_load_ss((const float *)gptr + r);
320             x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
321 
322             pi0 = _mm_cvtsi32_si128(*(const int *)pi);
323             pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
324 
325             pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
326             pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
327 
328             bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
329             bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
330 
331             pi += stride;
332         }
333 
334         _mm_storeu_ps((float *)dst, bp0);
335         _mm_storeu_ps((float *)dst + 4, bp1);
336         dst = (char *)dst + 32;
337     }
338 }
339 
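/*
 * Horizontal blur pass over the float4 intermediate produced above: rct
 * kernel-weighted float4 taps are accumulated (in pairs after the first one),
 * converted back to integers, and the four channel bytes are gathered with the
 * Mu8 shuffle (bytes 0, 4, 8, 12) into a single output RGBA pixel.
 */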
340 void rsdIntrinsicBlurHFU4_K(void *dst,
341                           const void *pin, const void *gptr,
342                           int rct, int x1, int x2) {
343     const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
344     const float *pi;
345     __m128 pf, x, y;
346     __m128i o;
347     int r;
348 
349     for (; x1 < x2; ++x1) {
350         /* rct is defined as 2*r+1 by the caller */
351         x = _mm_load_ss((const float *)gptr);
352         x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
353 
354         pi = (const float *)pin + (x1 << 2);
355         pf = _mm_mul_ps(x, _mm_load_ps(pi));
356 
357         for (r = 1; r < rct; r += 2) {
358             x = _mm_load_ss((const float *)gptr + r);
359             y = _mm_load_ss((const float *)gptr + r + 1);
360             x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
361             y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
362 
363             pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
364             pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
365         }
366 
367         o = _mm_cvtps_epi32(pf);
368         *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
369         dst = (char *)dst + 4;
370     }
371 }
372 
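/*
 * Single-channel variant of the horizontal pass: four 1-byte outputs per
 * iteration.  The tap loop consumes four kernel weights at a time, using
 * _mm_alignr_epi8 to slide the input window one float (4 bytes) at a time
 * across the loaded data.
 */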
373 void rsdIntrinsicBlurHFU1_K(void *dst,
374                           const void *pin, const void *gptr,
375                           int rct, int x1, int x2) {
376     const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
377     const float *pi;
378     __m128 pf, g0, g1, g2, g3, gx, p0, p1;
379     __m128i o;
380     int r;
381 
382     for (; x1 < x2; x1+=4) {
383         g0 = _mm_load_ss((const float *)gptr);
384         g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
385 
386         pi = (const float *)pin + x1;
387         pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
388 
389         for (r = 1; r < rct; r += 4) {
390             gx = _mm_loadu_ps((const float *)gptr + r);
391             p0 = _mm_loadu_ps(pi + r);
392             p1 = _mm_loadu_ps(pi + r + 4);
393 
394             g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
395             pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
396             g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
397             pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
398             g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
399             pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
400             g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
401             pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
402         }
403 
404         o = _mm_cvtps_epi32(pf);
405         *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
406         dst = (char *)dst + 4;
407     }
408 }
409 
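/*
 * YUV to RGBA conversion, four pixels per iteration, with each U/V sample
 * shared by two adjacent pixels.  Using the coefficients noted below this is
 * the usual fixed-point expansion, roughly:
 *   R = (298*(Y-16) + 409*(V-128)               + 128) >> 8
 *   G = (298*(Y-16) - 100*(U-128) - 208*(V-128) + 128) >> 8
 *   B = (298*(Y-16) + 516*(U-128)               + 128) >> 8
 * clamped to 0..255, with alpha forced to 255.  pUV is an interleaved chroma
 * plane; this variant takes V from the even bytes and U from the odd bytes,
 * while rsdIntrinsicYuvR_K below swaps them.
 */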
410 void rsdIntrinsicYuv_K(void *dst,
411                        const unsigned char *pY, const unsigned char *pUV,
412                        uint32_t count, const short *param) {
413     __m128i biasY, biasUV;
414     __m128i c0, c1, c2, c3, c4;
415 
416     biasY = _mm_set1_epi32(param[8]);   /*  16 */
417     biasUV = _mm_set1_epi32(param[16]); /* 128 */
418 
419     c0 = _mm_set1_epi32(param[0]);  /*  298 */
420     c1 = _mm_set1_epi32(param[1]);  /*  409 */
421     c2 = _mm_set1_epi32(param[2]);  /* -100 */
422     c3 = _mm_set1_epi32(param[3]);  /*  516 */
423     c4 = _mm_set1_epi32(param[4]);  /* -208 */
424 
425     __m128i Y, UV, U, V, R, G, B, A;
426 
427     A = _mm_set1_epi32(255);
428     uint32_t i;
429 
430     for (i = 0; i < (count << 1); ++i) {
431         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
432         UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
433 
434         Y = _mm_sub_epi32(Y, biasY);
435         UV = _mm_sub_epi32(UV, biasUV);
436 
437         U = _mm_shuffle_epi32(UV, 0xf5);
438         V = _mm_shuffle_epi32(UV, 0xa0);
439 
440         Y = mullo_epi32(Y, c0);
441 
442         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
443         R = _mm_add_epi32(R, biasUV);
444         R = _mm_srai_epi32(R, 8);
445 
446         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
447         G = _mm_add_epi32(G, mullo_epi32(V, c4));
448         G = _mm_add_epi32(G, biasUV);
449         G = _mm_srai_epi32(G, 8);
450 
451         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
452         B = _mm_add_epi32(B, biasUV);
453         B = _mm_srai_epi32(B, 8);
454 
455         __m128i y1, y2, y3, y4;
456 
457         y1 = packus_epi32(R, G);
458         y2 = packus_epi32(B, A);
459         y3 = _mm_packus_epi16(y1, y2);
460         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
461                                           14, 10, 6, 2,
462                                           13,  9, 5, 1,
463                                           12,  8, 4, 0);
464         y4 = _mm_shuffle_epi8(y3, T4x4);
465         _mm_storeu_si128((__m128i *)dst, y4);
466         pY += 4;
467         pUV += 4;
468         dst = (__m128i *)dst + 1;
469     }
470 }
471 
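/* Same conversion as rsdIntrinsicYuv_K, with the U/V byte order in the interleaved chroma plane swapped. */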
472 void rsdIntrinsicYuvR_K(void *dst,
473                        const unsigned char *pY, const unsigned char *pUV,
474                        uint32_t count, const short *param) {
475     __m128i biasY, biasUV;
476     __m128i c0, c1, c2, c3, c4;
477 
478     biasY = _mm_set1_epi32(param[8]);   /*  16 */
479     biasUV = _mm_set1_epi32(param[16]); /* 128 */
480 
481     c0 = _mm_set1_epi32(param[0]);  /*  298 */
482     c1 = _mm_set1_epi32(param[1]);  /*  409 */
483     c2 = _mm_set1_epi32(param[2]);  /* -100 */
484     c3 = _mm_set1_epi32(param[3]);  /*  516 */
485     c4 = _mm_set1_epi32(param[4]);  /* -208 */
486 
487     __m128i Y, UV, U, V, R, G, B, A;
488 
489     A = _mm_set1_epi32(255);
490     uint32_t i;
491 
492     for (i = 0; i < (count << 1); ++i) {
493         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
494         UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
495 
496         Y = _mm_sub_epi32(Y, biasY);
497         UV = _mm_sub_epi32(UV, biasUV);
498 
499         V = _mm_shuffle_epi32(UV, 0xf5);
500         U = _mm_shuffle_epi32(UV, 0xa0);
501 
502         Y = mullo_epi32(Y, c0);
503 
504         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
505         R = _mm_add_epi32(R, biasUV);
506         R = _mm_srai_epi32(R, 8);
507 
508         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
509         G = _mm_add_epi32(G, mullo_epi32(V, c4));
510         G = _mm_add_epi32(G, biasUV);
511         G = _mm_srai_epi32(G, 8);
512 
513         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
514         B = _mm_add_epi32(B, biasUV);
515         B = _mm_srai_epi32(B, 8);
516 
517         __m128i y1, y2, y3, y4;
518 
519         y1 = packus_epi32(R, G);
520         y2 = packus_epi32(B, A);
521         y3 = _mm_packus_epi16(y1, y2);
522         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
523                                           14, 10, 6, 2,
524                                           13,  9, 5, 1,
525                                           12,  8, 4, 0);
526         y4 = _mm_shuffle_epi8(y3, T4x4);
527         _mm_storeu_si128((__m128i *)dst, y4);
528         pY += 4;
529         pUV += 4;
530         dst = (__m128i *)dst + 1;
531     }
532 }
533 
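/* Same conversion again, but with planar chroma: U and V are read from two separate planes. */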
534 void rsdIntrinsicYuv2_K(void *dst,
535                        const unsigned char *pY, const unsigned char *pU,
536                        const unsigned char *pV, uint32_t count, const short *param) {
537     __m128i biasY, biasUV;
538     __m128i c0, c1, c2, c3, c4;
539 
540     biasY = _mm_set1_epi32(param[8]);   /*  16 */
541     biasUV = _mm_set1_epi32(param[16]); /* 128 */
542 
543     c0 = _mm_set1_epi32(param[0]);  /*  298 */
544     c1 = _mm_set1_epi32(param[1]);  /*  409 */
545     c2 = _mm_set1_epi32(param[2]);  /* -100 */
546     c3 = _mm_set1_epi32(param[3]);  /*  516 */
547     c4 = _mm_set1_epi32(param[4]);  /* -208 */
548 
549     __m128i Y, U, V, R, G, B, A;
550 
551     A = _mm_set1_epi32(255);
552     uint32_t i;
553 
554     for (i = 0; i < (count << 1); ++i) {
555         Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
556         U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
557         V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
558 
559         Y = _mm_sub_epi32(Y, biasY);
560         U = _mm_sub_epi32(U, biasUV);
561         V = _mm_sub_epi32(V, biasUV);
562 
563         Y = mullo_epi32(Y, c0);
564 
565         R = _mm_add_epi32(Y, mullo_epi32(V, c1));
566         R = _mm_add_epi32(R, biasUV);
567         R = _mm_srai_epi32(R, 8);
568 
569         G = _mm_add_epi32(Y, mullo_epi32(U, c2));
570         G = _mm_add_epi32(G, mullo_epi32(V, c4));
571         G = _mm_add_epi32(G, biasUV);
572         G = _mm_srai_epi32(G, 8);
573 
574         B = _mm_add_epi32(Y, mullo_epi32(U, c3));
575         B = _mm_add_epi32(B, biasUV);
576         B = _mm_srai_epi32(B, 8);
577 
578         __m128i y1, y2, y3, y4;
579 
580         y1 = packus_epi32(R, G);
581         y2 = packus_epi32(B, A);
582         y3 = _mm_packus_epi16(y1, y2);
583         const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
584                                           14, 10, 6, 2,
585                                           13,  9, 5, 1,
586                                           12,  8, 4, 0);
587         y4 = _mm_shuffle_epi8(y3, T4x4);
588         _mm_storeu_si128((__m128i *)dst, y4);
589         pY += 4;
590         pU += 4;
591         pV += 4;
592         dst = (__m128i *)dst + 1;
593     }
594 }
595 
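/*
 * 5x5 convolution over 8-bit RGBA pixels, four output pixels per iteration.
 * Eight pixels are widened to 16 bits from each of the five input rows and
 * multiplied against the 25 coefficients (again presumably Q8 fixed point,
 * given the final shift right by 8) before being packed back to bytes.
 */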
596 extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
597                                           const void *y1, const void *y2,
598                                           const void *y3, const void *y4,
599                                           const short *coef, uint32_t count) {
600     __m128i x;
601     __m128i c0, c2, c4, c6, c8, c10, c12;
602     __m128i c14, c16, c18, c20, c22, c24;
603     __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
604     __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
605     __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
606     __m128i p16, p17, p18, p19, p20, p21, p22, p23;
607     __m128i p24, p25, p26, p27, p28, p29, p30, p31;
608     __m128i p32, p33, p34, p35, p36, p37, p38, p39;
609     __m128i o0, o1, o2, o3;
610     uint32_t i;
611 
612     x = _mm_loadl_epi64((const __m128i *)(coef+0));
613     c0  = _mm_shuffle_epi32(x, 0x00);
614     c2  = _mm_shuffle_epi32(x, 0x55);
615 
616     x = _mm_loadl_epi64((const __m128i *)(coef+4));
617     c4  = _mm_shuffle_epi32(x, 0x00);
618     c6  = _mm_shuffle_epi32(x, 0x55);
619 
620     x = _mm_loadl_epi64((const __m128i *)(coef+8));
621     c8  = _mm_shuffle_epi32(x, 0x00);
622     c10  = _mm_shuffle_epi32(x, 0x55);
623 
624     x = _mm_loadl_epi64((const __m128i *)(coef+12));
625     c12  = _mm_shuffle_epi32(x, 0x00);
626     c14  = _mm_shuffle_epi32(x, 0x55);
627 
628     x = _mm_loadl_epi64((const __m128i *)(coef+16));
629     c16  = _mm_shuffle_epi32(x, 0x00);
630     c18  = _mm_shuffle_epi32(x, 0x55);
631 
632     x = _mm_loadl_epi64((const __m128i *)(coef+20));
633     c20  = _mm_shuffle_epi32(x, 0x00);
634     c22  = _mm_shuffle_epi32(x, 0x55);
635 
636     x = _mm_loadl_epi64((const __m128i *)(coef+24));
637     c24  = _mm_shuffle_epi32(x, 0x00);
638 
639     for (i = 0; i < count; ++i) {
640 
641         p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
642         p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
643         p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
644         p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
645         p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
646         p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
647         p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
648         p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
649 
650         p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
651         p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
652         p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
653         p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
654         p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
655         p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
656         p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
657         p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
658 
659         p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
660         p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
661         p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
662         p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
663         p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
664         p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
665         p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
666         p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
667 
668         p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
669         p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
670         p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
671         p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
672         p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
673         p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
674         p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
675         p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
676 
677         p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
678         p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
679         p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
680         p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
681         p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
682         p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
683         p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
684         p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
685 
686         o0 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1),  c0);
687         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3),  c2));
688         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8),  c4));
689         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10),  c6));
690         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c8));
691         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
692         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
693         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
694         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
695         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
696         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
697         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
698         o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
699         o0 = _mm_srai_epi32(o0, 8);
700 
701         o1 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2),  c0);
702         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c2));
703         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9),  c4));
704         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11),  c6));
705         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13),  c8));
706         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
707         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
708         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
709         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
710         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
711         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
712         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
713         o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
714         o1 = _mm_srai_epi32(o1, 8);
715 
716         o2 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3),  c0);
717         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5),  c2));
718         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10),  c4));
719         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c6));
720         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14),  c8));
721         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
722         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
723         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
724         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
725         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
726         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
727         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
728         o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
729         o2 = _mm_srai_epi32(o2, 8);
730 
731         o3 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c0);
732         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6),  c2));
733         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11),  c4));
734         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13),  c6));
735         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15),  c8));
736         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
737         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
738         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
739         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
740         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
741         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
742         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
743         o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
744         o3 = _mm_srai_epi32(o3, 8);
745 
746         o0 = packus_epi32(o0, o1);
747         o2 = packus_epi32(o2, o3);
748         o0 = _mm_packus_epi16(o0, o2);
749         _mm_storeu_si128((__m128i *)dst, o0);
750 
751         y0 = (const char *)y0 + 16;
752         y1 = (const char *)y1 + 16;
753         y2 = (const char *)y2 + 16;
754         y3 = (const char *)y3 + 16;
755         y4 = (const char *)y4 + 16;
756         dst = (char *)dst + 16;
757     }
758 }
759 
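/*
 * The rsdIntrinsicBlend*_K kernels below process 8 RGBA pixels (two 16-byte
 * vectors) per iteration, widening each half to 16 bits per channel.  Products
 * of 8-bit values are scaled back with ">> 8", the usual cheap approximation
 * of dividing by 255.  The _mm_shufflelo/hi_epi16(..., 0xFF) pairs broadcast
 * each pixel's alpha across its four channels.
 *
 * Source-over: dst = src + dst * (255 - src.a) >> 8.
 */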
760 void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
761     __m128i all1s, ina, ins;
762     __m128i in0, in1, out0, out1;
763     __m128i t0, t1, t2, t3;
764     uint32_t i;
765 
766     all1s = _mm_set1_epi16(255);
767 
768     for (i = 0; i < count8; ++i) {
769         in0 = _mm_loadu_si128((const __m128i *)src);
770         in1 = _mm_loadu_si128((const __m128i *)src + 1);
771         out0 = _mm_loadu_si128((const __m128i *)dst);
772         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
773 
774         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
775         ina = _mm_shufflelo_epi16(ins, 0xFF);
776         ina = _mm_shufflehi_epi16(ina, 0xFF);
777         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
778         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
779         t0 = _mm_srli_epi16(t0, 8);
780         t0 = _mm_add_epi16(t0, ins);
781 
782         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
783         ina = _mm_shufflelo_epi16(ins, 0xFF);
784         ina = _mm_shufflehi_epi16(ina, 0xFF);
785         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
786         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
787         t1 = _mm_srli_epi16(t1, 8);
788         t1 = _mm_add_epi16(t1, ins);
789 
790         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
791         ina = _mm_shufflelo_epi16(ins, 0xFF);
792         ina = _mm_shufflehi_epi16(ina, 0xFF);
793         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
794         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
795         t2 = _mm_srli_epi16(t2, 8);
796         t2 = _mm_add_epi16(t2, ins);
797 
798         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
799         ina = _mm_shufflelo_epi16(ins, 0xFF);
800         ina = _mm_shufflehi_epi16(ina, 0xFF);
801         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
802         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
803         t3 = _mm_srli_epi16(t3, 8);
804         t3 = _mm_add_epi16(t3, ins);
805 
806         t0 = _mm_packus_epi16(t0, t1);
807         t2 = _mm_packus_epi16(t2, t3);
808         _mm_storeu_si128((__m128i *)dst, t0);
809         _mm_storeu_si128((__m128i *)dst + 1, t2);
810 
811         src = (const __m128i *)src + 2;
812         dst = (__m128i *)dst + 2;
813     }
814 }
815 
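/* Destination-over: dst = dst + src * (255 - dst.a) >> 8. */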
816 void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
817     __m128i all1s, outa, outs;
818     __m128i in0, in1, out0, out1;
819     __m128i t0, t1, t2, t3;
820     uint32_t i;
821 
822     all1s = _mm_set1_epi16(255);
823 
824     for (i = 0; i < count8; ++i) {
825         in0 = _mm_loadu_si128((const __m128i *)src);
826         in1 = _mm_loadu_si128((const __m128i *)src + 1);
827         out0 = _mm_loadu_si128((const __m128i *)dst);
828         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
829 
830 
831         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
832         outa = _mm_shufflelo_epi16(outs, 0xFF);
833         outa = _mm_shufflehi_epi16(outa, 0xFF);
834         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
835         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
836         t0 = _mm_srli_epi16(t0, 8);
837         t0 = _mm_add_epi16(t0, outs);
838 
839         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
840         outa = _mm_shufflelo_epi16(outs, 0xFF);
841         outa = _mm_shufflehi_epi16(outa, 0xFF);
842         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
843         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
844         t1 = _mm_srli_epi16(t1, 8);
845         t1 = _mm_add_epi16(t1, outs);
846 
847         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
848         outa = _mm_shufflelo_epi16(outs, 0xFF);
849         outa = _mm_shufflehi_epi16(outa, 0xFF);
850         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
851         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
852         t2 = _mm_srli_epi16(t2, 8);
853         t2 = _mm_add_epi16(t2, outs);
854 
855         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
856         outa = _mm_shufflelo_epi16(outs, 0xFF);
857         outa = _mm_shufflehi_epi16(outa, 0xFF);
858         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
859         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
860         t3 = _mm_srli_epi16(t3, 8);
861         t3 = _mm_add_epi16(t3, outs);
862 
863         t0 = _mm_packus_epi16(t0, t1);
864         t2 = _mm_packus_epi16(t2, t3);
865         _mm_storeu_si128((__m128i *)dst, t0);
866         _mm_storeu_si128((__m128i *)dst + 1, t2);
867 
868         src = (const __m128i *)src + 2;
869         dst = (__m128i *)dst + 2;
870     }
871 }
872 
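/* Source-in: dst = src * dst.a >> 8. */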
873 void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
874     __m128i outa;
875     __m128i in0, in1, out0, out1;
876     __m128i t0, t1, t2, t3;
877     uint32_t i;
878 
879     for (i = 0; i < count8; ++i) {
880         in0 = _mm_loadu_si128((const __m128i *)src);
881         in1 = _mm_loadu_si128((const __m128i *)src + 1);
882         out0 = _mm_loadu_si128((const __m128i *)dst);
883         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
884 
885         outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
886         outa = _mm_shufflelo_epi16(outa, 0xFF);
887         outa = _mm_shufflehi_epi16(outa, 0xFF);
888         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
889         t0 = _mm_mullo_epi16(t0, outa);
890         t0 = _mm_srli_epi16(t0, 8);
891 
892         outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
893         outa = _mm_shufflelo_epi16(outa, 0xFF);
894         outa = _mm_shufflehi_epi16(outa, 0xFF);
895         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
896         t1 = _mm_mullo_epi16(t1, outa);
897         t1 = _mm_srli_epi16(t1, 8);
898 
899         outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
900         outa = _mm_shufflelo_epi16(outa, 0xFF);
901         outa = _mm_shufflehi_epi16(outa, 0xFF);
902         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
903         t2 = _mm_mullo_epi16(t2, outa);
904         t2 = _mm_srli_epi16(t2, 8);
905 
906         outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
907         outa = _mm_shufflelo_epi16(outa, 0xFF);
908         outa = _mm_shufflehi_epi16(outa, 0xFF);
909         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
910         t3 = _mm_mullo_epi16(t3, outa);
911         t3 = _mm_srli_epi16(t3, 8);
912 
913         t0 = _mm_packus_epi16(t0, t1);
914         t2 = _mm_packus_epi16(t2, t3);
915         _mm_storeu_si128((__m128i *)dst, t0);
916         _mm_storeu_si128((__m128i *)dst + 1, t2);
917 
918         src = (const __m128i *)src + 2;
919         dst = (__m128i *)dst + 2;
920     }
921 }
922 
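/* Destination-in: dst = dst * src.a >> 8. */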
923 void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
924     __m128i ina;
925     __m128i in0, in1, out0, out1;
926     __m128i t0, t1, t2, t3;
927     uint32_t i;
928 
929     for (i = 0; i < count8; ++i) {
930         in0 = _mm_loadu_si128((const __m128i *)src);
931         in1 = _mm_loadu_si128((const __m128i *)src + 1);
932         out0 = _mm_loadu_si128((const __m128i *)dst);
933         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
934 
935         ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
936         ina = _mm_shufflelo_epi16(ina, 0xFF);
937         ina = _mm_shufflehi_epi16(ina, 0xFF);
938         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
939         t0 = _mm_mullo_epi16(t0, ina);
940         t0 = _mm_srli_epi16(t0, 8);
941 
942         ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
943         ina = _mm_shufflelo_epi16(ina, 0xFF);
944         ina = _mm_shufflehi_epi16(ina, 0xFF);
945         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
946         t1 = _mm_mullo_epi16(t1, ina);
947         t1 = _mm_srli_epi16(t1, 8);
948 
949         ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
950         ina = _mm_shufflelo_epi16(ina, 0xFF);
951         ina = _mm_shufflehi_epi16(ina, 0xFF);
952         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
953         t2 = _mm_mullo_epi16(t2, ina);
954         t2 = _mm_srli_epi16(t2, 8);
955 
956         ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
957         ina = _mm_shufflelo_epi16(ina, 0xFF);
958         ina = _mm_shufflehi_epi16(ina, 0xFF);
959         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
960         t3 = _mm_mullo_epi16(t3, ina);
961         t3 = _mm_srli_epi16(t3, 8);
962 
963         t0 = _mm_packus_epi16(t0, t1);
964         t2 = _mm_packus_epi16(t2, t3);
965         _mm_storeu_si128((__m128i *)dst, t0);
966         _mm_storeu_si128((__m128i *)dst + 1, t2);
967 
968         src = (const __m128i *)src + 2;
969         dst = (__m128i *)dst + 2;
970     }
971 }
972 
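/* Source-out: dst = src * (255 - dst.a) >> 8. */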
973 void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
974     __m128i all1s, outa;
975     __m128i in0, in1, out0, out1;
976     __m128i t0, t1, t2, t3;
977     uint32_t i;
978 
979     all1s = _mm_set1_epi16(255);
980 
981     for (i = 0; i < count8; ++i) {
982         in0 = _mm_loadu_si128((const __m128i *)src);
983         in1 = _mm_loadu_si128((const __m128i *)src + 1);
984         out0 = _mm_loadu_si128((const __m128i *)dst);
985         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
986 
987         outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
988         outa = _mm_shufflelo_epi16(outa, 0xFF);
989         outa = _mm_shufflehi_epi16(outa, 0xFF);
990         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
991         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
992         t0 = _mm_srli_epi16(t0, 8);
993 
994         outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
995         outa = _mm_shufflelo_epi16(outa, 0xFF);
996         outa = _mm_shufflehi_epi16(outa, 0xFF);
997         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
998         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
999         t1 = _mm_srli_epi16(t1, 8);
1000 
1001         outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1002         outa = _mm_shufflelo_epi16(outa, 0xFF);
1003         outa = _mm_shufflehi_epi16(outa, 0xFF);
1004         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1005         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
1006         t2 = _mm_srli_epi16(t2, 8);
1007 
1008         outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1009         outa = _mm_shufflelo_epi16(outa, 0xFF);
1010         outa = _mm_shufflehi_epi16(outa, 0xFF);
1011         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1012         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
1013         t3 = _mm_srli_epi16(t3, 8);
1014 
1015         t0 = _mm_packus_epi16(t0, t1);
1016         t2 = _mm_packus_epi16(t2, t3);
1017         _mm_storeu_si128((__m128i *)dst, t0);
1018         _mm_storeu_si128((__m128i *)dst + 1, t2);
1019 
1020         src = (const __m128i *)src + 2;
1021         dst = (__m128i *)dst + 2;
1022     }
1023 }
1024 
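/* Destination-out: dst = dst * (255 - src.a) >> 8. */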
1025 void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
1026     __m128i all1s, ina;
1027     __m128i in0, in1, out0, out1;
1028     __m128i t0, t1, t2, t3;
1029     uint32_t i;
1030 
1031     all1s = _mm_set1_epi16(255);
1032 
1033     for (i = 0; i < count8; ++i) {
1034         in0 = _mm_loadu_si128((const __m128i *)src);
1035         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1036         out0 = _mm_loadu_si128((const __m128i *)dst);
1037         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1038 
1039         ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1040         ina = _mm_shufflelo_epi16(ina, 0xFF);
1041         ina = _mm_shufflehi_epi16(ina, 0xFF);
1042         t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1043         t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
1044         t0 = _mm_srli_epi16(t0, 8);
1045 
1046         ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1047         ina = _mm_shufflelo_epi16(ina, 0xFF);
1048         ina = _mm_shufflehi_epi16(ina, 0xFF);
1049         t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1050         t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
1051         t1 = _mm_srli_epi16(t1, 8);
1052 
1053         ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1054         ina = _mm_shufflelo_epi16(ina, 0xFF);
1055         ina = _mm_shufflehi_epi16(ina, 0xFF);
1056         t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1057         t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
1058         t2 = _mm_srli_epi16(t2, 8);
1059 
1060         ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1061         ina = _mm_shufflelo_epi16(ina, 0xFF);
1062         ina = _mm_shufflehi_epi16(ina, 0xFF);
1063         t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1064         t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
1065         t3 = _mm_srli_epi16(t3, 8);
1066 
1067         t0 = _mm_packus_epi16(t0, t1);
1068         t2 = _mm_packus_epi16(t2, t3);
1069         _mm_storeu_si128((__m128i *)dst, t0);
1070         _mm_storeu_si128((__m128i *)dst + 1, t2);
1071 
1072         src = (const __m128i *)src + 2;
1073         dst = (__m128i *)dst + 2;
1074     }
1075 }
1076 
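/*
 * Source-atop: dst = (src * dst.a + dst * (255 - src.a)) >> 8 for the color
 * channels; the blendv_epi8 with M0001 keeps the original destination alpha.
 */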
1077 void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
1078     const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1079     __m128i all1s, ina, outa, ins, outs;
1080     __m128i in0, in1, out0, out1;
1081     __m128i t0, t1, t2, t3;
1082     uint32_t i;
1083 
1084     all1s = _mm_set1_epi16(255);
1085 
1086     for (i = 0; i < count8; ++i) {
1087         in0 = _mm_loadu_si128((const __m128i *)src);
1088         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1089         out0 = _mm_loadu_si128((const __m128i *)dst);
1090         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1091 
1092         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1093         ina = _mm_shufflelo_epi16(ins, 0xFF);
1094         ina = _mm_shufflehi_epi16(ina, 0xFF);
1095         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1096         outa = _mm_shufflelo_epi16(outs, 0xFF);
1097         outa = _mm_shufflehi_epi16(outa, 0xFF);
1098         t0 = _mm_sub_epi16(all1s, ina);
1099         t0 = _mm_mullo_epi16(t0, outs);
1100         t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
1101         t0 = _mm_srli_epi16(t0, 8);
1102 
1103         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1104         ina = _mm_shufflelo_epi16(ins, 0xFF);
1105         ina = _mm_shufflehi_epi16(ina, 0xFF);
1106         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1107         outa = _mm_shufflelo_epi16(outs, 0xFF);
1108         outa = _mm_shufflehi_epi16(outa, 0xFF);
1109         t1 = _mm_sub_epi16(all1s, ina);
1110         t1 = _mm_mullo_epi16(t1, outs);
1111         t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
1112         t1 = _mm_srli_epi16(t1, 8);
1113 
1114         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1115         ina = _mm_shufflelo_epi16(ins, 0xFF);
1116         ina = _mm_shufflehi_epi16(ina, 0xFF);
1117         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1118         outa = _mm_shufflelo_epi16(outs, 0xFF);
1119         outa = _mm_shufflehi_epi16(outa, 0xFF);
1120         t2 = _mm_sub_epi16(all1s, ina);
1121         t2 = _mm_mullo_epi16(t2, outs);
1122         t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
1123         t2 = _mm_srli_epi16(t2, 8);
1124 
1125         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1126         ina = _mm_shufflelo_epi16(ins, 0xFF);
1127         ina = _mm_shufflehi_epi16(ina, 0xFF);
1128         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1129         outa = _mm_shufflelo_epi16(outs, 0xFF);
1130         outa = _mm_shufflehi_epi16(outa, 0xFF);
1131         t3 = _mm_sub_epi16(all1s, ina);
1132         t3 = _mm_mullo_epi16(t3, outs);
1133         t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
1134         t3 = _mm_srli_epi16(t3, 8);
1135 
1136         t0 = _mm_packus_epi16(t0, t1);
1137         t0 = blendv_epi8(t0, out0, M0001);
1138         t2 = _mm_packus_epi16(t2, t3);
1139         t2 = blendv_epi8(t2, out1, M0001);
1140         _mm_storeu_si128((__m128i *)dst, t0);
1141         _mm_storeu_si128((__m128i *)dst + 1, t2);
1142 
1143         src = (const __m128i *)src + 2;
1144         dst = (__m128i *)dst + 2;
1145     }
1146 }
1147 
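/*
 * Destination-atop: dst = (dst * src.a + src * (255 - dst.a)) >> 8 for the
 * color channels; the blendv_epi8 with M0001 keeps the source alpha instead.
 */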
1148 void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
1149     const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1150     __m128i all1s, ina, ins, outa, outs;
1151     __m128i in0, in1, out0, out1;
1152     __m128i t0, t1, t2, t3;
1153     uint32_t i;
1154 
1155     all1s = _mm_set1_epi16(255);
1156 
1157     for (i = 0; i < count8; ++i) {
1158         in0 = _mm_loadu_si128((const __m128i *)src);
1159         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1160         out0 = _mm_loadu_si128((const __m128i *)dst);
1161         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1162 
1163         ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1164         ina = _mm_shufflelo_epi16(ins, 0xFF);
1165         ina = _mm_shufflehi_epi16(ina, 0xFF);
1166         outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1167         outa = _mm_shufflelo_epi16(outs, 0xFF);
1168         outa = _mm_shufflehi_epi16(outa, 0xFF);
1169         t0 = _mm_sub_epi16(all1s, outa);
1170         t0 = _mm_mullo_epi16(t0, ins);
1171         t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
1172         t0 = _mm_srli_epi16(t0, 8);
1173 
1174         ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1175         ina = _mm_shufflelo_epi16(ins, 0xFF);
1176         ina = _mm_shufflehi_epi16(ina, 0xFF);
1177         outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1178         outa = _mm_shufflelo_epi16(outs, 0xFF);
1179         outa = _mm_shufflehi_epi16(outa, 0xFF);
1180         t1 = _mm_sub_epi16(all1s, outa);
1181         t1 = _mm_mullo_epi16(t1, ins);
1182         t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
1183         t1 = _mm_srli_epi16(t1, 8);
1184 
1185         ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1186         ina = _mm_shufflelo_epi16(ins, 0xFF);
1187         ina = _mm_shufflehi_epi16(ina, 0xFF);
1188         outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1189         outa = _mm_shufflelo_epi16(outs, 0xFF);
1190         outa = _mm_shufflehi_epi16(outa, 0xFF);
1191         t2 = _mm_sub_epi16(all1s, outa);
1192         t2 = _mm_mullo_epi16(t2, ins);
1193         t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
1194         t2 = _mm_srli_epi16(t2, 8);
1195 
1196         ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1197         ina = _mm_shufflelo_epi16(ins, 0xFF);
1198         ina = _mm_shufflehi_epi16(ina, 0xFF);
1199         outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1200         outa = _mm_shufflelo_epi16(outs, 0xFF);
1201         outa = _mm_shufflehi_epi16(outa, 0xFF);
1202         t3 = _mm_sub_epi16(all1s, outa);
1203         t3 = _mm_mullo_epi16(t3, ins);
1204         t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
1205         t3 = _mm_srli_epi16(t3, 8);
1206 
1207         t0 = _mm_packus_epi16(t0, t1);
1208         t0 = blendv_epi8(t0, in0, M0001);
1209         t2 = _mm_packus_epi16(t2, t3);
1210         t2 = blendv_epi8(t2, in1, M0001);
1211         _mm_storeu_si128((__m128i *)dst, t0);
1212         _mm_storeu_si128((__m128i *)dst + 1, t2);
1213 
1214         src = (const __m128i *)src + 2;
1215         dst = (__m128i *)dst + 2;
1216     }
1217 }
1218 
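/* Bitwise XOR of source and destination pixels (not the Porter-Duff XOR operator). */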
1219 void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
1220     __m128i in0, in1, out0, out1;
1221     uint32_t i;
1222 
1223     for (i = 0; i < count8; ++i) {
1224         in0 = _mm_loadu_si128((const __m128i *)src);
1225         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1226         out0 = _mm_loadu_si128((const __m128i *)dst);
1227         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1228 
1229         out0 = _mm_xor_si128(out0, in0);
1230         out1 = _mm_xor_si128(out1, in1);
1231 
1232         _mm_storeu_si128((__m128i *)dst, out0);
1233         _mm_storeu_si128((__m128i *)dst + 1, out1);
1234 
1235         src = (const __m128i *)src + 2;
1236         dst = (__m128i *)dst + 2;
1237     }
1238 }
1239 
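/* Multiply: dst = src * dst >> 8, per channel. */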
1240 void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
1241     __m128i in0, in1, out0, out1;
1242     __m128i t0, t1, t2, t3;
1243     uint32_t i;
1244 
1245     for (i = 0; i < count8; ++i) {
1246         in0 = _mm_loadu_si128((const __m128i *)src);
1247         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1248         out0 = _mm_loadu_si128((const __m128i *)dst);
1249         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1250 
1251         t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1252         t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
1253         t0 = _mm_srli_epi16(t0, 8);
1254 
1255         t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1256         t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
1257         t1 = _mm_srli_epi16(t1, 8);
1258 
1259         t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1260         t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
1261         t2 = _mm_srli_epi16(t2, 8);
1262 
1263         t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1264         t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
1265         t3 = _mm_srli_epi16(t3, 8);
1266 
1267         t0 = _mm_packus_epi16(t0, t1);
1268         t2 = _mm_packus_epi16(t2, t3);
1269         _mm_storeu_si128((__m128i *)dst, t0);
1270         _mm_storeu_si128((__m128i *)dst + 1, t2);
1271 
1272         src = (const __m128i *)src + 2;
1273         dst = (__m128i *)dst + 2;
1274     }
1275 }
1276 
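/* Saturating per-channel add: dst = min(dst + src, 255). */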
1277 void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
1278     __m128i in0, in1, out0, out1;
1279     uint32_t i;
1280 
1281     for (i = 0; i < count8; ++i) {
1282         in0 = _mm_loadu_si128((const __m128i *)src);
1283         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1284         out0 = _mm_loadu_si128((const __m128i *)dst);
1285         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1286 
1287         out0 = _mm_adds_epu8(out0, in0);
1288         out1 = _mm_adds_epu8(out1, in1);
1289 
1290         _mm_storeu_si128((__m128i *)dst, out0);
1291         _mm_storeu_si128((__m128i *)dst + 1, out1);
1292 
1293         src = (const __m128i *)src + 2;
1294         dst = (__m128i *)dst + 2;
1295     }
1296 }
1297 
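/* Saturating per-channel subtract: dst = max(dst - src, 0). */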
1298 void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
1299     __m128i in0, in1, out0, out1;
1300     uint32_t i;
1301 
1302     for (i = 0; i < count8; ++i) {
1303         in0 = _mm_loadu_si128((const __m128i *)src);
1304         in1 = _mm_loadu_si128((const __m128i *)src + 1);
1305         out0 = _mm_loadu_si128((const __m128i *)dst);
1306         out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1307 
1308         out0 = _mm_subs_epu8(out0, in0);
1309         out1 = _mm_subs_epu8(out1, in1);
1310 
1311         _mm_storeu_si128((__m128i *)dst, out0);
1312         _mm_storeu_si128((__m128i *)dst + 1, out1);
1313 
1314         src = (const __m128i *)src + 2;
1315         dst = (__m128i *)dst + 2;
1316     }
1317 }
1318