/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Tri-linearly interpolated 3D colour look-up table applied to a run of
 * RGBA8888 pixels, hand-written for 32-bit ARM NEON.  Alpha is passed
 * through unchanged (the source alpha is saved in d10 and copied back into
 * the output before each store).  Fixed-point throughout: fractions are
 * 8-bit, each interpolation stage scales by 256 and narrows back with a
 * (rounding) shift-right by 8.
 */

/* Open a function: place it in .text, align, export, mark as a function
 * and begin an EABI unwind region. */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
/* Close the unwind region and record the symbol size. */
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Fetch and tri-linearly interpolate LUT samples for two pixels.
 *
 * Inputs:
 *   \src         d-register holding two 32-bit byte offsets into the LUT
 *                (one per pixel), as computed into q6/q7 by the main loop
 *   \xr0, \xr1   8-bit x fractions for pixel 0 / pixel 1 (scalar lanes)
 *   \yr0, \yr1   8-bit y fractions for pixel 0 / pixel 1
 *   \zr0, \zr1   8-bit z fractions for pixel 0 / pixel 1
 *   r3           LUT base pointer
 *   r4           pitchy: LUT byte stride for a +1 step in y
 *   r5           pitchz: LUT byte stride for a +1 step in z
 * Output:
 *   \dst         8 packed u8: the two interpolated RGBA results
 * Clobbers:      r6, r7, d6, d7, q8-q15
 *
 * Each 8-byte vld1 pulls in two adjacent 4-byte texels (x and x+1 at 4
 * bytes per RGBA texel), so the four loads per pixel cover all eight
 * corners of the interpolation cell: (y,z), (y+1,z), (y,z+1), (y+1,z+1),
 * each with both x neighbours.  Interpolation order is Y, then Z, then X.
 */
.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1

            vmov        r6, r7, \src            @ r6/r7 = per-pixel LUT offsets

            add         r6, r6, r3              @ absolute address, pixel 0
            add         r7, r7, r3              @ absolute address, pixel 1

            vld1.u8     d16, [r6], r4           @ (y,  z)   pair; advance by pitchy
            vld1.u8     d17, [r7], r4
            vld1.u8     d18, [r6], r5           @ (y+1,z)   pair; advance by pitchz
            vld1.u8     d19, [r7], r5

            vdup.u8     d6, \yr0                @ broadcast y fractions
            vdup.u8     d7, \yr1
            /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */
            /* q12/q13 = d16*256 + (d18 - d16)*yfrac, i.e. a lerp at z */
            vshll.u8    q12, d16, #8
            vshll.u8    q13, d17, #8
            vmlsl.u8    q12, d16, d6
            vmlsl.u8    q13, d17, d7
            vmlal.u8    q12, d18, d6
            vmlal.u8    q13, d19, d7

            vld1.u8     d18, [r6]               @ (y+1,z+1) pair
            vld1.u8     d19, [r7]

            sub         r6, r6, r4              @ step back by pitchy to (y,z+1)
            sub         r7, r7, r4

            vld1.u8     d16, [r6]               @ (y,  z+1) pair
            vld1.u8     d17, [r7]

            /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */
            /* same lerp as above, but at z+1 */
            vshll.u8    q14, d16, #8
            vshll.u8    q15, d17, #8
            vmlsl.u8    q14, d16, d6
            vmlsl.u8    q15, d17, d7
            vmlal.u8    q14, d18, d6
            vmlal.u8    q15, d19, d7

            /* Z interpolate, lane 0 q12/q14 -> q10 */
            /* lerp between front (q12) and rear (q14) with zfrac, then
             * round-narrow by 8 to drop one scaling stage */
            vshll.u16   q8, d24, #8
            vshll.u16   q9, d25, #8
            vmlsl.u16   q8, d24, \zr0
            vmlsl.u16   q9, d25, \zr0
            vmlal.u16   q8, d28, \zr0
            vmlal.u16   q9, d29, \zr0
            vrshrn.u32  d20, q8, #8
            vrshrn.u32  d21, q9, #8

            /* Z interpolate, lane 1 q13/q15 -> q11 */
            vshll.u16   q8, d26, #8
            vshll.u16   q9, d27, #8
            vmlsl.u16   q8, d26, \zr1
            vmlsl.u16   q9, d27, \zr1
            vmlal.u16   q8, d30, \zr1
            vmlal.u16   q9, d31, \zr1
            vrshrn.u32  d22, q8, #8
            vrshrn.u32  d23, q9, #8

            /* X interpolate, lanes 0 and 1 q10,q11 -> q14 */
            /* d20/d22 hold the x texel, d21/d23 the x+1 texel; lerp with
             * the x fractions */
            vshll.u16   q8, d20, #8
            vshll.u16   q9, d22, #8
            vmlsl.u16   q8, d20, \xr0
            vmlsl.u16   q9, d22, \xr1
            vmlal.u16   q8, d21, \xr0
            vmlal.u16   q9, d23, \xr1
            vshrn.u32   d28, q8, #8
            vshrn.u32   d29, q9, #8

            /* pack lanes 0-1 -> d12 */
            /* final scale removal with saturating rounding narrow */
            vqrshrn.u16 \dst, q14, #8
.endm

/* void rsdIntrinsic3DLUT_K(
 *          void *dst,          // r0
 *          void const *in,     // r1
 *          size_t count,       // r2
 *          void const *lut,    // r3
 *          int32_t pitchy,     // [sp]
 *          int32_t pitchz,     // [sp+#4]
 *          int dimx,           // [sp+#8]
 *          int dimy,           // [sp+#12]
 *          int dimz);          // [sp+#16]
 *
 * Maps `count` RGBA8888 pixels from `in` to `dst` through the 3D LUT at
 * `lut` (also RGBA8888, 4 bytes per texel, row stride pitchy, slice
 * stride pitchz), tri-linearly interpolating between the 8 surrounding
 * LUT texels.  Pixels are processed 8 at a time; partial leading/trailing
 * groups are padded with a replicated "safe" pixel and partially stored.
 * Returns 0 in r0.
 */
ENTRY(rsdIntrinsic3DLUT_K)
            push        {r4,r5,r6,r7}
            /* stack args start at sp+16 after the 4-register push */
            ldr         r4, [sp, #16]           @ r4  = pitchy
            ldr         r5, [sp, #20]           @ r5  = pitchz
            ldr         r6, [sp, #24]           @ r6  = dimx
            ldr         r7, [sp, #28]           @ r7  = dimy
            ldr         r12, [sp, #32]          @ r12 = dimz
            vpush       {d8-d15}                @ preserve callee-saved NEON regs

            /* d8 = {dimx, dimy, dimz, 0x0101} as u16 lanes; lane 3 keeps
             * the 0x01 filler from the vector-wide byte fill below.
             * d9 = {pitchy, pitchz} as two s32 lanes. */
            vmov.u8     d8, #1
            vmov.u16    d8[0], r6
            vmov.u16    d8[1], r7
            vmov.u16    d8[2], r12
            vmov        d9, r4, r5

            subs        r2, #8                  @ count -= 8
            bge         2f                      @ >= 8 pixels: full first group
            cmp         r2, #-8
            ble         9f                      @ count == 0: nothing to do
            b           4f                      @ 1..7 pixels: padded first group

            .align 6                            @ 64-byte align the hot loop head
1:          vst4.u8     {d12,d13,d14,d15}, [r0]!    @ store previous 8 results
/* r0  = dst
 * r1  = src
 * r2  = count
 * r3  = lut
 * r4  = pitchy
 * r5  = pitchz
 * r6  = offset0
 * r7  = offset1
 */
2:          vld4.u8     {d0,d2,d4,d6}, [r1]!    @ deinterleave 8 pixels: R,G,B,A planes
3:          vmov        d10, d6                 @ save alpha; restored before store
/* q0,q1,q2,q5  source data
 * q4           dimensions and pitches
 * q3,          scratch register for scalar access
 */
            vmov        q3, q4                  @ d6 = dims, d7 = pitches (scalar lanes)
            vmovl.u8    q0, d0                  @ widen R,G,B to u16
            vmovl.u8    q1, d2
            vmovl.u8    q2, d4
            vmul.u16    q0, q0, d6[0]           @ R * dimx  -> 8.8 fixed-point coords
            vmul.u16    q1, q1, d6[1]           @ G * dimy
            vmul.u16    q2, q2, d6[2]           @ B * dimz

/* vrsra.u16 below would be more accurate, but this can result in a dim.0 case
 * where we try to read from the limit of the array and the limit +1 to
 * interpolate, even though the fractional component is zero. Strictly this is
 * correct, except for the illegal access problem.
 */
            vsra.u16    q0, q0, #8              @ coord += coord>>8 (~ scale by 257/256)
            vsra.u16    q1, q1, #8
            vsra.u16    q2, q2, #8

            vshr.u16    q12, q0, #8             @ integer cell indices
            vshr.u16    q13, q1, #8
            vshr.u16    q14, q2, #8
            vbic.u16    q0, #0xff00             @ keep 8-bit x fractions
            vmovn.u16   d2, q1                  @ keep 8-bit y fractions (narrowed)
            vbic.u16    q2, #0xff00             @ keep 8-bit z fractions
/* q0,d2,q2      fractional offset
 * q12,q13,q14   integer offset
 */
            /* byte offset = x*4 + y*pitchy + z*pitchz, for 8 pixels */
            vshll.u16   q6, d24, #2             @ x index * 4 (bytes per texel)
            vshll.u16   q7, d25, #2
            vmovl.u16   q8, d26                 @ widen y indices
            vmovl.u16   q9, d27
            vmovl.u16   q10, d28                @ widen z indices
            vmovl.u16   q11, d29
            vmla.s32    q6, q8, d9[0]           @ + y * pitchy
            vmla.s32    q7, q9, d9[0]
            vmla.s32    q6, q10, d9[1]          @ + z * pitchz
            vmla.s32    q7, q11, d9[1]
/* q6,q7 list of table offsets */

            /* lanes 0 and 1 */
            lanepair    dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1]

            /* lanes 2 and 3 */
            lanepair    dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3]

            /* lanes 4 and 5 */
            lanepair    dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1]

            /* lanes 6 and 7 */
            lanepair    dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3]

            /* transpose from per-pixel RGBA pairs back to R,G,B,A planes
             * for the interleaving vst4 store */
            vuzp.u8     d12, d13
            vuzp.u8     d14, d15
            vuzp.u8     d12, d14
            vuzp.u8     d13, d15

            subs        r2, r2, #8              @ 8 more pixels done
            vmov.u8     d15, d10                @ restore pass-through alpha
            bge         1b                      @ full group remains: store + reload

            cmp         r2, #-8
            blt         1f                      @ partial tail group: partial store
            vst4.u8     {d12,d13,d14,d15}, [r0]!    @ exactly one full group left
            beq         9f

            /* Partial load of 1..7 remaining pixels (r2 & 7 after bias).
             * First fill the vector with a safe value so padding lanes
             * index valid LUT memory. */
4:          vld1.u32    {d0[]}, [r1]            @ replicate first remaining pixel
            vmov        d2, d0
            vmov        d4, d0
            vmov        d6, d0
            tst         r2, #4                  @ 4+ pixels?
            beq         2f
            vld1.u32    {d0}, [r1]!
            vld1.u32    {d2}, [r1]!
2:          tst         r2, #2                  @ 2+ more?
            beq         2f
            vld1.u32    {d4}, [r1]!
2:          tst         r2, #1                  @ 1 more?
            beq         2f
            vld1.u32    {d6[0]}, [r1]!
2:          vuzp.8      d0, d2                  @ deinterleave (vld4 equivalent)
            vuzp.8      d4, d6
            vuzp.8      d0, d4
            vuzp.8      d2, d6
            b           3b                      @ process the padded group

            /* Partial store of 1..7 result pixels: re-interleave, then
             * store in 4/2/1 chunks driven by the low bits of the count. */
1:          vzip.8      d12, d14
            vzip.8      d13, d15
            vzip.8      d12, d13
            vzip.8      d14, d15
            tst         r2, #4
            beq         2f
            vst1.u32    {d12,d13}, [r0]!
2:          tst         r2, #2
            beq         2f
            vst1.u32    {d14}, [r0]!
2:          tst         r2, #1
            beq         9f
            vst1.u32    {d15[0]}, [r0]!

9:          mov         r0, #0                  @ return 0
            vpop        {d8-d15}
            pop         {r4,r5,r6,r7}
            bx          lr
END(rsdIntrinsic3DLUT_K)