/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Syntax: GNU as, ARM (A32) instruction set with NEON, run through the C
 * preprocessor.  ENTRY/END bracket each exported function: ENTRY switches to
 * .text, aligns to 2^4 bytes, exports the symbol, marks it as a function and
 * opens an unwind region; END closes the region and records the symbol size.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts in q8, but with the even
 * and odd bytes split into d16 and d17 respectively.  U and V are in d20
 * and d21.  Working constants are pre-loaded into q13-q15, and q3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 *
 * In:      d16 = even Y bytes, d17 = odd Y bytes
 *          d20 = U, d21 = V (one chroma sample per even/odd Y pair)
 *          q13 = red bias, q14 = green bias, q15 = blue bias (u16 lanes)
 * Out:     d0,d1 = red, d2,d3 = green, d4,d5 = blue, each in pixel order
 *          (d0/d2/d4 hold the first eight pixels, d1/d3/d5 the last eight)
 * Clobbers: q0-q2, q4-q8, q11, q12.  q3 and q13-q15 are read-only here.
 */
.macro yuvkern
        vmov.i8     d15, #149

        vmull.u8    q1, d16, d15        // g0 = y0 * 149
        vmull.u8    q5, d17, d15        // g1 = y1 * 149

        vmov.i8     d14, #50
        vmov.i8     d15, #104
        vmull.u8    q8, d20, d14        // g2 = u * 50 + v * 104
        vmlal.u8    q8, d21, d15

        vshr.u8     d14, d21, #1
        vaddw.u8    q0, q1, d14         // r0 = y0 * 149 + (v >> 1)
        vaddw.u8    q4, q5, d14         // r1 = y1 * 149 + (v >> 1)

        vshll.u8    q7, d20, #2
        vadd.u16    q2, q1, q7          // b0 = y0 * 149 + (u << 2)
        vadd.u16    q6, q5, q7          // b1 = y1 * 149 + (u << 2)

        vmov.i8     d14, #204
        vmov.i8     d15, #254
        vmull.u8    q11, d21, d14       // r2 = v * 204
        vmull.u8    q12, d20, d15       // b2 = u * 254

        vhadd.u16   q0, q11             // r0 = (r0 + r2) >> 1
        vhadd.u16   q4, q11             // r1 = (r1 + r2) >> 1
        vqadd.u16   q1, q14             // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vqadd.u16   q5, q14             // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vhadd.u16   q2, q12             // b0 = (b0 + b2) >> 1
        vhadd.u16   q6, q12             // b1 = (b1 + b2) >> 1

        vqsub.u16   q0, q13             // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vqsub.u16   q4, q13             // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vqsub.u16   q1, q8              // g0 = satu16(g0 - g2)
        vqsub.u16   q5, q8              // g1 = satu16(g1 - g2)
        vqsub.u16   q2, q15             // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        vqsub.u16   q6, q15             // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        /* Round, saturate and narrow to 8 bits.  Red and blue were already
         * halved by vhadd above, so they shift by one bit less than green.
         */
        vqrshrn.u16 d0, q0, #6
        vqrshrn.u16 d1, q1, #7
        vqrshrn.u16 d2, q4, #6
        vqrshrn.u16 d3, q5, #7
        vqrshrn.u16 d4, q2, #6
        vqrshrn.u16 d5, q6, #6

        /* Re-interleave the even and odd pixel results into pixel order. */
        vzip.u8     q0, q1
        vzip.u8     d4, d5
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 *
 * Macro arguments:
 *   kernel       name of the conversion macro to expand (e.g. yuvkern)
 *   interleaved  non-zero: chroma is one byte-interleaved stream at r3
 *                zero:     chroma is two separate planes at r3 and r4
 *   swapuv       non-zero (interleaved only): swap d20/d21 after the
 *                deinterleave, so the first byte of each chroma pair is
 *                handed to the kernel as V rather than U
 *
 * Registers on entry:
 *   r0 = output pointer (four bytes are written per pixel)
 *   r1 = Y pointer
 *   r2 = number of pixels to convert
 *   r3 = chroma pointer (the U plane when not interleaved)
 *   r4 = V plane pointer (used only when not interleaved)
 *
 * Clobbers: r0-r5, flags, and every NEON register the kernel touches plus
 * q3, q8, q10 and q13-q15.  The caller is responsible for preserving d8-d15.
 */
.macro wrap_line kernel, interleaved=0, swapuv=0

        /* Broadcast the per-channel bias constants used by the kernel. */
        movw        r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vdup.i16    q13, r5
        movw        r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vdup.i16    q14, r5
        movw        r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        vdup.i16    q15, r5

        vmov.i8     q3, #0xff           // constant alpha, stored as the 4th channel

        subs        r2, #16             // fewer than 16 pixels: go straight to the tail
        bhs         1f
        b           2f

        /* Main loop: 16 pixels per iteration. */
        .align 4
1:      vld2.u8     {d16,d17}, [r1]!    // d16 = even Y, d17 = odd Y
        pld         [r1, #256]
  .if \interleaved
        vld2.u8     {d20,d21}, [r3]!    // split the chroma byte pairs
    .if \swapuv
        vswp        d20, d21
    .endif
        pld         [r3, #256]
  .else
        vld1.u8     d20, [r3]!          // 8 U samples
        vld1.u8     d21, [r4]!          // 8 V samples
        pld         [r3, #128]
        pld         [r4, #128]
  .endif

        \kernel

        subs        r2, #16             // flags are consumed by the bhs below

        vst4.u8     {d0,d2,d4,d6}, [r0]!    // pixels 0-7,  channels interleaved
        vst4.u8     {d1,d3,d5,d7}, [r0]!    // pixels 8-15, channels interleaved

        bhs         1b

2:      adds        r2, #16             // r2 = leftover pixel count (0..15)
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
        vmov.i8     q8, #0
        vmov.i8     q10, #0

        tst         r2, #8              // 8-pixel chunk
        beq         1f
        vld1.u8     d17, [r1]!
  .if \interleaved
        vld1.u8     d21, [r3]!
  .else
        vld1.u32    d20[1], [r3]!
        vld1.u32    d21[1], [r4]!
  .endif

1:      tst         r2, #4              // 4-pixel chunk
        beq         1f
        vld1.u32    d16[1], [r1]!
  .if \interleaved
        vld1.u32    d20[1], [r3]!
  .else
        vld1.u16    d20[1], [r3]!
        vld1.u16    d21[1], [r4]!
  .endif
1:      tst         r2, #2              // 2-pixel chunk
        beq         1f
        vld1.u16    d16[1], [r1]!
  .if \interleaved
        vld1.u16    d20[1], [r3]!
  .else
        vld1.u8     d20[1], [r3]!
        vld1.u8     d21[1], [r4]!
  .endif
1:      tst         r2, #1              // single pixel
        beq         1f
        vld1.u8     d16[1], [r1]!
  .if \interleaved
        vld1.u16    d20[0], [r3]!       // a whole two-byte chroma pair is read
  .else
        vld1.u8     d20[0], [r3]!
        vld1.u8     d21[0], [r4]!
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
1:      vuzp.8      d16, d17
  .if \interleaved
        vuzp.8      d20, d21
    .if \swapuv
        vswp        d20, d21
    .endif
  .endif

        \kernel

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
        vzip.8      q0, q2
        vzip.8      q1, q3
        vzip.8      q0, q1
        vzip.8      q2, q3

        /* Store from the same register positions the chunks were loaded to:
         * 8 pixels = 32 bytes, 4 = 16 bytes, 2 = 8 bytes, 1 = 4 bytes.
         */
1:      tst         r2, #8
        beq         1f
        vst1.u8     {d4,d5,d6,d7}, [r0]!

1:      tst         r2, #4
        beq         1f
        vst1.u8     {d2,d3}, [r0]!
1:      tst         r2, #2
        beq         1f
        vst1.u8     d1, [r0]!
1:      tst         r2, #1
        beq         2f
        vst1.u32    d0[1], [r0]!
2:
.endm


/*  void rsdIntrinsicYuv2_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uin,    // r2
 *          void const *vin,    // r3
 *          size_t xstart,      // [sp]
 *          size_t xend);       // [sp+#4]
 *
 * Planar chroma variant: U and V come from two separate pointers, each
 * advanced by half the pixel offset.  Converts pixels [xstart, xend).
 *
 * NOTE(review): unlike the two interleaved entry points below, xstart is not
 * rounded down to an even value here; presumably callers always pass an even
 * xstart - confirm against the caller.
 */
ENTRY(rsdIntrinsicYuv2_K)
        push        {r4,r5}
        ldr         r5, [sp, #8]        // xstart (stack args sit above the 8 bytes just pushed)
        mov         r4, r3              // r4 = vin
        mov         r3, r2              // r3 = uin
        ldr         r2, [sp, #12]       // xend

        add         r0, r5, LSL #2      // out += xstart * 4
        add         r1, r5              // yin += xstart
        add         r3, r5, LSR #1      // uin += xstart / 2
        add         r4, r5, LSR #1      // vin += xstart / 2
        sub         r2, r5              // r2 = xend - xstart = pixel count

        vpush       {d8-d15}            // the kernel writes q4-q7

        wrap_line yuvkern, 0

        vpop        {d8-d15}
        pop         {r4,r5}
        bx          lr
END(rsdIntrinsicYuv2_K)

/*  void rsdIntrinsicYuv_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uvin,   // r2
 *          size_t xstart,      // r3
 *          size_t xend);       // [sp]
 *
 * Interleaved chroma variant with the chroma bytes swapped after loading, so
 * the first byte of each pair is used as V and the second as U.  xstart is
 * rounded down to an even pixel before any pointer is advanced.
 */
ENTRY(rsdIntrinsicYuv_K)
        push        {r4,r5}
        bic         r4, r3, #1          // r4 = xstart rounded down to even
        add         r3, r2, r4          // r3 = uvin + r4 (two chroma bytes per two pixels)
        ldr         r2, [sp, #8]        // xend

        add         r0, r4, LSL #2      // out += r4 * 4
        add         r1, r4              // yin += r4
        sub         r2, r4              // r2 = xend - r4 = pixel count

        vpush       {d8-d15}            // the kernel writes q4-q7

        wrap_line yuvkern, 1, 1

        vpop        {d8-d15}
        pop         {r4,r5}
        bx          lr
END(rsdIntrinsicYuv_K)

/*  void rsdIntrinsicYuvR_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uvin,   // r2
 *          size_t xstart,      // r3
 *          size_t xend);       // [sp]
 *
 * Interleaved chroma variant without the swap: the first byte of each pair
 * is used as U and the second as V.  xstart is rounded down to an even pixel
 * before any pointer is advanced.
 */
ENTRY(rsdIntrinsicYuvR_K)
        push        {r4,r5}
        bic         r4, r3, #1          // r4 = xstart rounded down to even
        add         r3, r2, r4          // r3 = uvin + r4 (two chroma bytes per two pixels)
        ldr         r2, [sp, #8]        // xend

        add         r0, r4, LSL #2      // out += r4 * 4
        add         r1, r4              // yin += r4
        sub         r2, r4              // r2 = xend - r4 = pixel count

        vpush       {d8-d15}            // the kernel writes q4-q7

        wrap_line yuvkern, 1

        vpop        {d8-d15}
        pop         {r4,r5}
        bx          lr
END(rsdIntrinsicYuvR_K)