/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively.  U and V are in
 * v10 and v11.  Working constants are pre-loaded into v24-v31, and v3 and v7
 * are pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
.macro yuvkern, regu=v10, regv=v11
        /* v0    out R_lo / even R_lo accumulator
         * v1    out G_lo / even G_lo accumulator
         * v2    out B_lo / even B_lo accumulator
         * v3    out A_lo / const 0xff
         * v4    out R_hi / even R_hi accumulator
         * v5    out G_hi / even G_hi accumulator
         * v6    out B_hi / even B_hi accumulator
         * v7    out A_hi / const 0xff
         * v8    even Y   / G_lo luma tmp
         * v9    odd Y    / G_hi luma tmp
         * \regu in U
         * \regv in V
         * v12   R_lo luma tmp
         * v13   B_lo luma tmp
         * v14   R_hi luma tmp
         * v15   B_hi luma tmp
         * v16   odd R_lo accumulator
         * v17   odd G_lo accumulator
         * v18   odd B_lo accumulator
         * v19   multiplier extra bits low
         * v20   odd R_hi accumulator
         * v21   odd G_hi accumulator
         * v22   odd B_hi accumulator
         * v23   multiplier extra bits high
         * v24   constant 149
         * v25   constant 50
         * v26   constant 104
         * v27   constant 204
         * v28   constant 254
         * v29   constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
         * v30   constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
         * v31   constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
         */

        umull       v1.8h,  v8.8b,  v24.8b      // g0 = y0 * 149
        umull       v17.8h, v9.8b,  v24.8b      // g1 = y1 * 149
        umull2      v5.8h,  v8.16b, v24.16b     // g0_hi = y0_hi * 149
        umull2      v21.8h, v9.16b, v24.16b     // g1_hi = y1_hi * 149

        umull       v8.8h,  \regu\().8b,  v25.8b    // g2 = u * 50 + v * 104
        umlal       v8.8h,  \regv\().8b,  v26.8b
        umull2      v9.8h,  \regu\().16b, v25.16b   // g2_hi = u_hi * 50 + v_hi * 104
        umlal2      v9.8h,  \regv\().16b, v26.16b

        ushr        v19.16b, \regv\().16b, #1
        uaddw       v0.8h,  v1.8h,  v19.8b      // r0 = g0 + (v >> 1)
        uaddw       v16.8h, v17.8h, v19.8b      // r1 = g1 + (v >> 1)

        uaddw2      v4.8h,  v5.8h,  v19.16b     // r0_hi = g0_hi + (v_hi >> 1)
        uaddw2      v20.8h, v21.8h, v19.16b     // r1_hi = g1_hi + (v_hi >> 1)

        ushll       v19.8h, \regu\().8b,  #2
        ushll2      v23.8h, \regu\().16b, #2
        add         v2.8h,  v1.8h,  v19.8h      // b0 = g0 + (u << 2)
        add         v18.8h, v17.8h, v19.8h      // b1 = g1 + (u << 2)

        add         v6.8h,  v5.8h,  v23.8h      // b0_hi = g0_hi + (u_hi << 2)
        add         v22.8h, v21.8h, v23.8h      // b1_hi = g1_hi + (u_hi << 2)

        umull       v12.8h, \regv\().8b,  v27.8b    // r2 = v * 204
        umull       v13.8h, \regu\().8b,  v28.8b    // b2 = u * 254

        umull2      v14.8h, \regv\().16b, v27.16b   // r2_hi = v_hi * 204
        umull2      v15.8h, \regu\().16b, v28.16b   // b2_hi = u_hi * 254

        uhadd       v0.8h,  v0.8h,  v12.8h      // r0 = (r0 + r2) >> 1
        uhadd       v16.8h, v16.8h, v12.8h      // r1 = (r1 + r2) >> 1
        uqadd       v1.8h,  v1.8h,  v30.8h      // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v17.8h, v17.8h, v30.8h      // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h,  v2.8h,  v13.8h      // b0 = (b0 + b2) >> 1
        uhadd       v18.8h, v18.8h, v13.8h      // b1 = (b1 + b2) >> 1

        uhadd       v4.8h,  v4.8h,  v14.8h      // r0_hi = (r0_hi + r2_hi) >> 1
        uhadd       v20.8h, v20.8h, v14.8h      // r1_hi = (r1_hi + r2_hi) >> 1
        uqadd       v5.8h,  v5.8h,  v30.8h      // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v21.8h, v21.8h, v30.8h      // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v6.8h,  v6.8h,  v15.8h      // b0_hi = (b0_hi + b2_hi) >> 1
        uhadd       v22.8h, v22.8h, v15.8h      // b1_hi = (b1_hi + b2_hi) >> 1

        uqsub       v0.8h,  v0.8h,  v29.8h      // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v16.8h, v16.8h, v29.8h      // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h,  v1.8h,  v8.8h       // g0 = satu16(g0 - g2)
        uqsub       v17.8h, v17.8h, v8.8h       // g1 = satu16(g1 - g2)
        uqsub       v2.8h,  v2.8h,  v31.8h      // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v18.8h, v18.8h, v31.8h      // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqsub       v4.8h,  v4.8h,  v29.8h      // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v20.8h, v20.8h, v29.8h      // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v5.8h,  v5.8h,  v9.8h       // g0_hi = satu16(g0_hi - g2_hi)
        uqsub       v21.8h, v21.8h, v9.8h       // g1_hi = satu16(g1_hi - g2_hi)
        uqsub       v6.8h,  v6.8h,  v31.8h      // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v22.8h, v22.8h, v31.8h      // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqrshrn     v0.8b,  v0.8h,  #6
        uqrshrn     v16.8b, v16.8h, #6
        uqrshrn     v1.8b,  v1.8h,  #7
        uqrshrn     v17.8b, v17.8h, #7
        uqrshrn     v2.8b,  v2.8h,  #6
        uqrshrn     v18.8b, v18.8h, #6

        uqrshrn     v4.8b,  v4.8h,  #6
        uqrshrn     v20.8b, v20.8h, #6
        uqrshrn     v5.8b,  v5.8h,  #7
        uqrshrn     v21.8b, v21.8h, #7
        uqrshrn     v6.8b,  v6.8h,  #6
        uqrshrn     v22.8b, v22.8h, #6

        zip1        v0.16b, v0.16b, v16.16b
        zip1        v1.16b, v1.16b, v17.16b
        zip1        v2.16b, v2.16b, v18.16b

        zip1        v4.16b, v4.16b, v20.16b
        zip1        v5.16b, v5.16b, v21.16b
        zip1        v6.16b, v6.16b, v22.16b
.endm
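
/* For reference, the refactored fixed-point arithmetic above computes
 * (approximately, ignoring the intermediate saturations) the same result as
 * the scalar sketch below.  This note is illustrative only and not part of
 * the original source: clamp8() and yuv2rgb_scalar() are hypothetical
 * helpers, and the constants are the 1/128-scaled BT.601-style factors
 * loaded into v24-v28 (assumes <stdint.h>).
 *
 *      static inline uint8_t clamp8(int x) {
 *          return (uint8_t)(x < 0 ? 0 : x > 255 ? 255 : x);
 *      }
 *
 *      static void yuv2rgb_scalar(uint8_t y, uint8_t u, uint8_t v,
 *                                 uint8_t *r, uint8_t *g, uint8_t *b) {
 *          int y1 = 149 * (y - 16);                    // 149 ~ 1.164 * 128
 *          *r = clamp8((y1 + 204 * (v - 128) + (v - 128) / 2 + 64) >> 7);
 *          *g = clamp8((y1 -  50 * (u - 128) - 104 * (v - 128) + 64) >> 7);
 *          *b = clamp8((y1 + 254 * (u - 128) + 4 * (u - 128)   + 64) >> 7);
 *      }
 *
 * The vector code folds the "-16" and "-128" offsets into the constants in
 * v29-v31 and splits each channel into halving adds and saturating
 * add/subtract steps so that no 16-bit intermediate overflows; the final
 * ">> 7" is done by uqrshrn (#6 after the halving add for R and B, #7 for G).
 */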

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
.macro wrap_line kernel, interleaved=0, swapuv=0
        movi        v24.16b, #149
        movi        v25.16b, #50
        movi        v26.16b, #104
        movi        v27.16b, #204
        movi        v28.16b, #254
        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v29.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v30.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v31.8h, w5

        movi        v3.16b, #0xff
        movi        v7.16b, #0xff

        subs        x2, x2, #32
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.16b,v9.16b}, [x1], #32
  .if \interleaved
        ld2         {v10.16b,v11.16b}, [x3], #32
  .else
        ld1         {v10.16b}, [x3], #16
        ld1         {v11.16b}, [x4], #16
  .endif

  .if \swapuv
        \kernel regu=v11, regv=v10
  .else
        \kernel
  .endif

        subs        x2, x2, #32

        st4         {v0.16b - v3.16b}, [x0], #64
        st4         {v4.16b - v7.16b}, [x0], #64

        bhs         1b

2:      adds        x2, x2, #32
        beq         2f

        /* To handle the tail portion of the data (something less than 32
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions, and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v10.8b, #0
        movi        v11.8b, #0

        tbz         x2, #4, 1f
        ld1         {v9.16b}, [x1], #16
  .if \interleaved
        ld1         {v11.16b}, [x3], #16
  .else
        ld1         {v10.d}[1], [x3], #8
        ld1         {v11.d}[1], [x4], #8
  .endif
1:      tbz         x2, #3, 1f
        ld1         {v8.d}[1], [x1], #8
  .if \interleaved
        ld1         {v10.d}[1], [x3], #8
  .else
        ld1         {v10.s}[1], [x3], #4
        ld1         {v11.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v10.s}[1], [x3], #4
  .else
        ld1         {v10.h}[1], [x3], #2
        ld1         {v11.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v10.h}[1], [x3], #2
  .else
        ld1         {v10.b}[1], [x3], #1
        ld1         {v11.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v10.h}[0], [x3], #2
  .else
        ld1         {v10.b}[0], [x3], #1
        ld1         {v11.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
1:      mov         v12.16b, v8.16b
        uzp1        v8.16b, v12.16b, v9.16b
        uzp2        v9.16b, v12.16b, v9.16b
  .if \interleaved
        mov         v12.16b, v10.16b
        uzp1        v10.16b, v12.16b, v11.16b
        uzp2        v11.16b, v12.16b, v11.16b
  .endif

  .if \swapuv
        \kernel regu=v11, regv=v10
  .else
        \kernel
  .endif

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
        zip1        v16.16b, v0.16b, v2.16b
        zip2        v18.16b, v0.16b, v2.16b
        zip1        v17.16b, v1.16b, v3.16b
        zip2        v19.16b, v1.16b, v3.16b
        zip1        v0.16b, v16.16b, v17.16b
        zip2        v1.16b, v16.16b, v17.16b
        zip1        v2.16b, v18.16b, v19.16b
        zip2        v3.16b, v18.16b, v19.16b

        /* Luckily v4-v7 don't need to be unzipped because the complete set of
         * four registers can be stored together using st4.
         */

        tbz         x2, #4, 1f
        st4         {v4.16b - v7.16b}, [x0], #64
1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm
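
/* The remainder handling in wrap_line decomposes the leftover pixel count
 * (anything below 32) into its binary digits: each set bit selects one
 * power-of-two sized chunk, and the store sequence mirrors the loads.  The
 * scalar sketch below shows the same chunking idea applied to a plain byte
 * copy; it is illustrative only and not part of the original source.
 *
 *      #include <stddef.h>
 *      #include <string.h>
 *
 *      // Copy a tail of n < 32 bytes in power-of-two chunks, one chunk per
 *      // set bit of n, largest first, so no per-byte loop is needed.
 *      static void copy_tail(unsigned char *dst, const unsigned char *src,
 *                            size_t n) {
 *          for (size_t chunk = 16; chunk != 0; chunk >>= 1) {
 *              if (n & chunk) {
 *                  memcpy(dst, src, chunk);
 *                  dst += chunk;
 *                  src += chunk;
 *              }
 *          }
 *      }
 *
 * The vector code does the same bit tests with tbz, but gathers the chunks
 * into lanes of v8-v11 so the whole remainder can be converted with a single
 * kernel invocation before being scattered back out.
 */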


/* void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 */
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1
        add         x0, x0, x4, LSL #2
        add         x1, x1, x4
        add         x4, x3, x6
        add         x3, x2, x6
        sub         x2, x5, x6, LSL #1

        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/* void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/* void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)
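
/* Usage sketch (illustrative only, not part of the original source): the
 * entry points follow the C prototypes given in the comments above and are
 * expected to receive start-of-row pointers, since each function applies the
 * xstart offsets itself (see the address arithmetic after each ENTRY).  The
 * row-pointer names below are hypothetical.
 *
 *      #include <stddef.h>
 *
 *      extern void rsdIntrinsicYuv_K(void *out, void const *yin,
 *                                    void const *uvin,
 *                                    size_t xstart, size_t xend);
 *
 *      // Convert pixels [xstart & ~1, xend) of one row from semi-planar
 *      // YUV (interleaved chroma) to 32-bit RGBA with opaque alpha:
 *      rsdIntrinsicYuv_K(outRow, yRow, uvRow, xstart, xend);
 *
 *      // Planar variant with separate U and V rows (prototype in the
 *      // comment above rsdIntrinsicYuv2_K):
 *      rsdIntrinsicYuv2_K(outRow, yRow, uRow, vRow, xstart, xend);
 */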