/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

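/* Interpolate a pair of output lanes.
 *
 *   dst         destination register (v20 or v21, lower or upper half)
 *   src0, src1  byte offsets into the LUT for the two lanes
 *   xr0..zr1    per-lane fractional weights for the x, y and z axes
 *
 * For each lane, the 2x2x2 neighbourhood is fetched from the LUT base in x3
 * using the y and z pitches in x4/x5, blended along y, then z, then x with
 * 8-bit fixed-point weights, and the saturated result is narrowed into \dst.
 */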
.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1

            smov        x6, \src0
            smov        x7, \src1

            add         x6, x6, x3
            add         x7, x7, x3

            ld1         {v16.2s}, [x6], x4
            ld1         {v17.2s}, [x7], x4

            ld1         {v18.2s}, [x6], x5
            ld1         {v19.2s}, [x7], x5

            dup         v8.8b, \yr0
            dup         v9.8b, \yr1
            /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
            zip1        v12.16b, v5.16b, v16.16b
            zip1        v13.16b, v5.16b, v17.16b
            umlsl       v12.8h, v16.8b, v8.8b
            umlsl       v13.8h, v17.8b, v9.8b
            umlal       v12.8h, v18.8b, v8.8b
            umlal       v13.8h, v19.8b, v9.8b

            ld1         {v18.2s}, [x6]
            ld1         {v19.2s}, [x7]

            sub         x6, x6, x4
            sub         x7, x7, x4

            ld1         {v16.2s}, [x6]
            ld1         {v17.2s}, [x7]

            /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
            zip1        v14.16b, v5.16b, v16.16b
            zip1        v15.16b, v5.16b, v17.16b
            umlsl       v14.8h, v16.8b, v8.8b
            umlsl       v15.8h, v17.8b, v9.8b
            umlal       v14.8h, v18.8b, v8.8b
            umlal       v15.8h, v19.8b, v9.8b

            /* Z interpolate, lane 0 v12/v14 -> v10 */
            ushll       v8.4s, v12.4h, #8
            ushll2      v9.4s, v12.8h, #8
            umlsl       v8.4s, v12.4h, \zr0
            umlsl2      v9.4s, v12.8h, \zr0
            umlal       v8.4s, v14.4h, \zr0
            umlal2      v9.4s, v14.8h, \zr0
            rshrn       v10.4h, v8.4s, #8
            rshrn2      v10.8h, v9.4s, #8

            /* Z interpolate, lane 1 v13/v15 -> v11 */
            ushll       v8.4s, v13.4h, #8
            ushll2      v9.4s, v13.8h, #8
            umlsl       v8.4s, v13.4h, \zr1
            umlsl2      v9.4s, v13.8h, \zr1
            umlal       v8.4s, v15.4h, \zr1
            umlal2      v9.4s, v15.8h, \zr1
            rshrn       v11.4h, v8.4s, #8
            rshrn2      v11.8h, v9.4s, #8

            /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
            ushll       v8.4s, v10.4h, #8
            ushll       v9.4s, v11.4h, #8
            umlsl       v8.4s, v10.4h, \xr0
            umlsl       v9.4s, v11.4h, \xr1
            umlal2      v8.4s, v10.8h, \xr0
            umlal2      v9.4s, v11.8h, \xr1
            shrn        v14.4h, v8.4s, #8
            shrn2       v14.8h, v9.4s, #8

            /* pack lanes 0 and 1 into \dst */
.ifc \dst, v20.16b
            uqrshrn2    \dst, v14.8h, #8
.else ; .ifc \dst, v21.16b
            uqrshrn2    \dst, v14.8h, #8
.else
            uqrshrn     \dst, v14.8h, #8
.endif ; .endif
.endm

/* void rsdIntrinsic3DLUT_K(
 *          void *dst,          // x0
 *          void const *in,     // x1
 *          size_t count,       // x2
 *          void const *lut,    // x3
 *          int32_t pitchy,     // w4
 *          int32_t pitchz,     // w5
 *          int dimx,           // w6
 *          int dimy,           // w7
 *          int dimz);          // [sp]
 */
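/* Each output pixel is a trilinear lookup of its RGB value through a
 * dimx x dimy x dimz table of four-byte entries, with alpha passed through
 * unchanged.  As a rough C sketch of the per-pixel maths (the names and the
 * exact weighting/rounding here are illustrative, not a statement of the code
 * below; (bx,fx), (by,fy) and (bz,fz) stand for the integer/fraction parts of
 * the scaled input channels computed in the body of the function):
 *
 *   #include <stdint.h>
 *
 *   static void lut3d_pixel(uint8_t out[4], const uint8_t in[4],
 *                           const uint8_t *cell,     // lut + 4*bx + pitchy*by + pitchz*bz
 *                           int fx, int fy, int fz,  // fractions, 0..255
 *                           int pitchy, int pitchz) {
 *       for (int c = 0; c < 3; c++) {
 *           int p000 = cell[c],                    p100 = cell[4 + c];
 *           int p010 = cell[pitchy + c],           p110 = cell[pitchy + 4 + c];
 *           int p001 = cell[pitchz + c],           p101 = cell[pitchz + 4 + c];
 *           int p011 = cell[pitchy + pitchz + c],  p111 = cell[pitchy + pitchz + 4 + c];
 *           int y00 = p000 * (256 - fy) + p010 * fy;            // blend along y...
 *           int y01 = p001 * (256 - fy) + p011 * fy;
 *           int y10 = p100 * (256 - fy) + p110 * fy;
 *           int y11 = p101 * (256 - fy) + p111 * fy;
 *           int z0  = (y00 * (256 - fz) + y01 * fz + 128) >> 8;  // ...then along z...
 *           int z1  = (y10 * (256 - fz) + y11 * fz + 128) >> 8;
 *           int v   = (z0 * (256 - fx) + z1 * fx + (1 << 15)) >> 16;  // ...then x
 *           out[c] = (uint8_t)(v > 255 ? 255 : v);               // saturate, as uqrshrn does
 *       }
 *       out[3] = in[3];                                          // alpha is copied, not looked up
 *   }
 */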
ENTRY(rsdIntrinsic3DLUT_K)
            ldr         w8, [sp]
            stp         d8, d9, [sp, #-64]!
            stp         d10, d11, [sp, #16]
            stp         d12, d13, [sp, #32]
            stp         d14, d15, [sp, #48]
            movi        v4.8b, #1
            ins         v4.h[0], w6
            ins         v4.h[1], w7
            ins         v4.h[2], w8
            ins         v4.s[2], w4
            ins         v4.s[3], w5
            movi        v5.16b, #0
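/* v4 now holds the constant arguments: dimx/dimy/dimz in h[0]-h[2] and
 * pitchy/pitchz in s[2]/s[3].  v5 stays zero and is used by the zip-based
 * widening in the lanepair macro.
 */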

            subs        x2, x2, #8
            bge         2f
            cmn         x2, #8    // same as cmp x2, #-8
            ble         9f
            b           4f
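/* Pixels are processed eight at a time.  A remainder of 1-7 pixels (or a
 * total count below eight) is loaded by the code at 4: and stored by the code
 * at the second 1: label; 9: restores the saved registers and returns.
 */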

            .align 6
1:          st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
/* x0  = dst
 * x1  = src
 * x2  = count
 * x3  = lut
 * x4  = pitchy
 * x5  = pitchz
 * x6  = offset0
 * x7  = offset1
 */
2:          ld4         {v0.8b-v3.8b}, [x1], #32
/* v0,v1,v2,v3 source data
 * v4 dimensions and pitches
 */
3:          uxtl        v0.8h, v0.8b
            uxtl        v1.8h, v1.8b
            uxtl        v2.8h, v2.8b
            mul         v0.8h, v0.8h, v4.h[0]
            mul         v1.8h, v1.8h, v4.h[1]
            mul         v2.8h, v2.8h, v4.h[2]
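/* v0,v1,v2 now hold channel * dimension products; once adjusted below they
 * are (approximately) 8.8 fixed-point table coordinates.
 */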

/* ursra below would be more accurate, but it can produce a coordinate of
 * dim.0, where we would read from the limit of the array and from the limit
 * + 1 in order to interpolate, even though the fractional component is zero.
 * Strictly the result is correct, except for the illegal access problem.
 */
            usra        v0.8h, v0.8h, #8
            usra        v1.8h, v1.8h, #8
            usra        v2.8h, v2.8h, #8

            ushr        v12.8h, v0.8h, #8
            ushr        v13.8h, v1.8h, #8
            ushr        v14.8h, v2.8h, #8
            bic         v0.8h, #0xff, LSL #8
            xtn         v1.8b, v1.8h
            bic         v2.8h, #0xff, LSL #8

/* v0.8h,v1.8b,v2.8h fractional offset
 * v12.8h,v13.8h,v14.8h integer offset
 */

            ushll       v6.4s, v12.4h, #2
            ushll2      v7.4s, v12.8h, #2
            uxtl        v8.4s, v13.4h
            uxtl2       v9.4s, v13.8h
            uxtl        v10.4s, v14.4h
            uxtl2       v11.4s, v14.8h
            mla         v6.4s, v8.4s,  v4.s[2]
            mla         v7.4s, v9.4s,  v4.s[2]
            mla         v6.4s, v10.4s, v4.s[3]
            mla         v7.4s, v11.4s, v4.s[3]

/* v6,v7 list of table offsets */
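/* Each offset is 4*x + pitchy*y + pitchz*z bytes from the LUT base, so every
 * 8-byte load in lanepair fetches the two adjacent entries at x and x+1.
 */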

        /* lanes 0 and 1 */
            lanepair    dst=v20.8b,  src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]

        /* lanes 2 and 3 */
            lanepair    dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]

        /* lanes 4 and 5 */
            lanepair    dst=v21.8b,  src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]

        /* lanes 6 and 7 */
            lanepair    dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]

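/* The lanepair results land as packed RGBA pairs in v20/v21; the uzp sequence
 * below de-interleaves them into planar r/g/b in v20/v21/v22 for st4, while
 * v23 takes the pass-through alpha.
 */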
            uzp1        v6.16b, v20.16b, v21.16b
            uzp2        v7.16b, v20.16b, v21.16b
            uzp1        v20.16b, v6.16b, v7.16b
            uzp2        v22.16b, v6.16b, v7.16b
            mov         v21.d[0], v20.d[1]

            subs        x2, x2, #8
            mov         v23.8b, v3.8b

            bge         1b

            cmn         x2, #8    // same as cmp x2, #-8
            blt         1f

            st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
            beq         9f

            /* fill all lanes with the first remaining pixel, so that the
             * unused lanes still compute valid LUT offsets */
4:          ld4r        {v0.8b-v3.8b}, [x1]
            tbz         x2, #2, 2f
            ld4         {v0.b-v3.b}[0], [x1], #4
            ld4         {v0.b-v3.b}[1], [x1], #4
            ld4         {v0.b-v3.b}[2], [x1], #4
            ld4         {v0.b-v3.b}[3], [x1], #4
2:          tbz         x2, #1, 2f
            ld4         {v0.b-v3.b}[4], [x1], #4
            ld4         {v0.b-v3.b}[5], [x1], #4
2:          tbz         x2, #0, 2f
            ld4         {v0.b-v3.b}[6], [x1], #4
2:          b           3b

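/* Store the remaining 1-7 pixels, mirroring the lane pattern used by the tail
 * load at 4: above; the low bits of the (now negative) count select the lanes.
 */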
1:          tst         x2, #4
            beq         2f
            st4         {v20.b-v23.b}[0], [x0], #4
            st4         {v20.b-v23.b}[1], [x0], #4
            st4         {v20.b-v23.b}[2], [x0], #4
            st4         {v20.b-v23.b}[3], [x0], #4
2:          tst         x2, #2
            beq         2f
            st4         {v20.b-v23.b}[4], [x0], #4
            st4         {v20.b-v23.b}[5], [x0], #4
2:          tst         x2, #1
            beq         9f
            st4         {v20.b-v23.b}[6], [x0], #4

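/* Restore the callee-saved SIMD registers and return. */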
9:          ldp         d14, d15, [sp, #48]
            ldp         d12, d13, [sp, #32]
            ldp         d10, d11, [sp, #16]
            ldp         d8, d9, [sp], #64
            ret
END(rsdIntrinsic3DLUT_K)