/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1

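            /* All the interpolation steps below build the same 8.8
             * fixed-point blend out of vshll/vmlsl/vmlal:
             *   lerp(a, b, f) = (a << 8) - a*f + b*f = a*(256-f) + b*f
             * leaving eight fractional bits for the final narrowing
             * shifts to consume.
             */
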
            vmov        r6, r7, \src

            add         r6, r6, r3
            add         r7, r7, r3

            vld1.u8     d16, [r6], r4
            vld1.u8     d17, [r7], r4

            vld1.u8     d18, [r6], r5
            vld1.u8     d19, [r7], r5

            vdup.u8     d6, \yr0
            vdup.u8     d7, \yr1
            /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */
            vshll.u8    q12, d16, #8
            vshll.u8    q13, d17, #8
            vmlsl.u8    q12, d16, d6
            vmlsl.u8    q13, d17, d7
            vmlal.u8    q12, d18, d6
            vmlal.u8    q13, d19, d7

            vld1.u8     d18, [r6]
            vld1.u8     d19, [r7]

            sub         r6, r6, r4
            sub         r7, r7, r4

            vld1.u8     d16, [r6]
            vld1.u8     d17, [r7]

            /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */
            vshll.u8    q14, d16, #8
            vshll.u8    q15, d17, #8
            vmlsl.u8    q14, d16, d6
            vmlsl.u8    q15, d17, d7
            vmlal.u8    q14, d18, d6
            vmlal.u8    q15, d19, d7

            /* Z interpolate, lane 0 q12/q14 -> q10 */
            vshll.u16   q8, d24, #8
            vshll.u16   q9, d25, #8
            vmlsl.u16   q8, d24, \zr0
            vmlsl.u16   q9, d25, \zr0
            vmlal.u16   q8, d28, \zr0
            vmlal.u16   q9, d29, \zr0
            vrshrn.u32  d20, q8, #8
            vrshrn.u32  d21, q9, #8

            /* Z interpolate, lane 1 q13/q15 -> q11 */
            vshll.u16   q8, d26, #8
            vshll.u16   q9, d27, #8
            vmlsl.u16   q8, d26, \zr1
            vmlsl.u16   q9, d27, \zr1
            vmlal.u16   q8, d30, \zr1
            vmlal.u16   q9, d31, \zr1
            vrshrn.u32  d22, q8, #8
            vrshrn.u32  d23, q9, #8

            /* X interpolate, lanes 0 and 1 q10,q11 -> q14 */
            vshll.u16   q8, d20, #8
            vshll.u16   q9, d22, #8
            vmlsl.u16   q8, d20, \xr0
            vmlsl.u16   q9, d22, \xr1
            vmlal.u16   q8, d21, \xr0
            vmlal.u16   q9, d23, \xr1
            vshrn.u32   d28, q8, #8
            vshrn.u32   d29, q9, #8

            /* pack lanes 0-1 -> \dst */
            vqrshrn.u16  \dst, q14, #8
.endm
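
/* For reference, a rough C-style sketch of what one lanepair lane computes
 * (names lut4, off, py, pz and fx/fy/fz are illustrative; each 8-byte load
 * fetches the texel pair at x and x+1, and rounding details are omitted):
 *
 *   yz0  = lerp(lut4[off],    lut4[off+py],    fy);  // front, z plane 0
 *   yz1  = lerp(lut4[off+pz], lut4[off+py+pz], fy);  // rear, z plane 1
 *   pair = lerp(yz0, yz1, fz);            // still holds both x and x+1
 *   out  = lerp(pair.lo, pair.hi, fx);    // collapse the x pair
 */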

/* void rsdIntrinsic3DLUT_K(
 *          void *dst,          // r0
 *          void const *in,     // r1
 *          size_t count,       // r2
 *          void const *lut,    // r3
 *          int32_t pitchy,     // [sp]
 *          int32_t pitchz,     // [sp+#4]
 *          int dimx,           // [sp+#8]
 *          int dimy,           // [sp+#12]
 *          int dimz);          // [sp+#16]
 */
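/* A hypothetical C-side call (caller-side names are illustrative only,
 * assuming a tightly packed RGBA table); the dim arguments are the maximum
 * valid index of each dimension, so the coord+1 reads during interpolation
 * stay inside the table:
 *
 *   rsdIntrinsic3DLUT_K(out, in, pixelCount, lutBase,
 *                       lutDimX * 4, lutDimX * lutDimY * 4,
 *                       lutDimX - 1, lutDimY - 1, lutDimZ - 1);
 */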
ENTRY(rsdIntrinsic3DLUT_K)
            push        {r4,r5,r6,r7}
            ldr         r4, [sp, #16]
            ldr         r5, [sp, #20]
            ldr         r6, [sp, #24]
            ldr         r7, [sp, #28]
            ldr         r12, [sp, #32]
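            /* after the 16-byte push above: r4 = pitchy, r5 = pitchz,
             * r6 = dimx, r7 = dimy, r12 = dimz
             */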
            vpush       {d8-d15}

            vmov.u8     d8, #1
            vmov.u16    d8[0], r6
            vmov.u16    d8[1], r7
            vmov.u16    d8[2], r12
            vmov        d9, r4, r5

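            /* d8 now holds {dimx, dimy, dimz, 0x0101} as u16 lanes (the
             * vmov.u8 fill leaves lane 3 at 0x0101); d9 = {pitchy, pitchz}
             */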
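            /* dispatch: count >= 8 enters the main loop at 2:, count <= 0
             * exits through 9:, and 0 < count < 8 builds a padded partial
             * block at 4:
             */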
            subs        r2, #8
            bge         2f
            cmp         r2, #-8
            ble         9f
            b           4f

            .align 6
1:          vst4.u8     {d12,d13,d14,d15}, [r0]!
/* r0  = dst
 * r1  = src
 * r2  = count
 * r3  = lut
 * r4  = pitchy
 * r5  = pitchz
 * r6  = offset0
 * r7  = offset1
 */
2:          vld4.u8     {d0,d2,d4,d6}, [r1]!
3:          vmov        d10, d6
/* q0,q1,q2,q5 source data
 * q4 dimensions and pitches
 * q3 scratch register for scalar access
 */
            vmov        q3, q4
            vmovl.u8    q0, d0
            vmovl.u8    q1, d2
            vmovl.u8    q2, d4
            vmul.u16    q0, q0, d6[0]
            vmul.u16    q1, q1, d6[1]
            vmul.u16    q2, q2, d6[2]

/* vrsra.u16 below would be more accurate, but rounding can produce a
 * coordinate of exactly dim.0, where we would read from the limit of the
 * array and from limit+1 to interpolate, even though the fractional
 * component is zero.  Strictly the result would still be correct, except
 * for the illegal access problem.
 */
            vsra.u16    q0, q0, #8
            vsra.u16    q1, q1, #8
            vsra.u16    q2, q2, #8
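
/* Per channel, the vmul and vsra pair computes, in 8.8 fixed point,
 *   coord = c*dim + ((c*dim) >> 8)  ~=  (c / 255) * dim * 256
 * so a channel value of 255 lands just below dim.0 and the interpolation
 * never indexes past the end of the table.
 */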

            vshr.u16    q12, q0, #8
            vshr.u16    q13, q1, #8
            vshr.u16    q14, q2, #8

            vbic.u16    q0, #0xff00
            vmovn.u16   d2, q1
            vbic.u16    q2, #0xff00

/* q0,d2,q2 fractional offset
 * q12,q13,q14 integer offset
 */

            vshll.u16   q6, d24, #2
            vshll.u16   q7, d25, #2
            vmovl.u16   q8, d26
            vmovl.u16   q9, d27
            vmovl.u16   q10, d28
            vmovl.u16   q11, d29
            vmla.s32    q6, q8,  d9[0]
            vmla.s32    q7, q9,  d9[0]
            vmla.s32    q6, q10, d9[1]
            vmla.s32    q7, q11, d9[1]

/* q6,q7 list of table offsets */
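/* (i.e. offset = x*4 + y*pitchy + z*pitchz at each pixel's integer
 * coordinates)
 */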

        /* lanes 0 and 1 */
            lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1]

        /* lanes 2 and 3 */
            lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3]

        /* lanes 4 and 5 */
            lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1]

        /* lanes 6 and 7 */
            lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3]

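            /* transpose the four pixel-pair registers to planar form
             * (d12 = R, d13 = G, d14 = B, d15 = A) for the interleaving
             * vst4 store
             */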
            vuzp.u8     d12, d13
            vuzp.u8     d14, d15
            vuzp.u8     d12, d14
            vuzp.u8     d13, d15

            subs        r2, r2, #8
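            /* restore the source alpha saved in d10: the LUT applies to
             * the RGB channels only
             */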
            vmov.u8     d15, d10

            bge         1b

            cmp         r2, #-8
            blt         1f

            vst4.u8     {d12,d13,d14,d15}, [r0]!

            beq         9f

            /* fill the vector with a safe value */
4:          vld1.u32    {d0[]}, [r1]
            vmov        d2, d0
            vmov        d4, d0
            vmov        d6, d0
            tst         r2, #4
            beq         2f
            vld1.u32    {d0}, [r1]!
            vld1.u32    {d2}, [r1]!
2:          tst         r2, #2
            beq         2f
            vld1.u32    {d4}, [r1]!
2:          tst         r2, #1
            beq         2f
            vld1.u32    {d6[0]}, [r1]!
2:          vuzp.8      d0, d2
            vuzp.8      d4, d6
            vuzp.8      d0, d4
            vuzp.8      d2, d6
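            /* the padded partial block now matches the main loop's input
             * layout, so rejoin it at 3:
             */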
            b           3b

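            /* reverse the planar transposition so the tail pixels can be
             * stored interleaved in chunks of 4, 2 and 1
             */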
1:          vzip.8      d12, d14
            vzip.8      d13, d15
            vzip.8      d12, d13
            vzip.8      d14, d15
            tst         r2, #4
            beq         2f
            vst1.u32    {d12,d13}, [r0]!
2:          tst         r2, #2
            beq         2f
            vst1.u32    {d14}, [r0]!
2:          tst         r2, #1
            beq         9f
            vst1.u32    {d15[0]}, [r0]!

9:          mov         r0, #0
            vpop        {d8-d15}
            pop         {r4,r5,r6,r7}
            bx          lr
END(rsdIntrinsic3DLUT_K)